Exemplo n.º 1
0
def check_validity_of_conll_bioes(bioes_filepath):
    """
    Check that the BIOES labels of a CoNLL file agree with its BIO labels.

    Every token line is split on single spaces; the last field is read as the
    BIOES label and the second-to-last field as the BIO label. The labels
    collected so far are passed to ``check_bio_bioes_compatibility`` at every
    empty line / ``-DOCSTART-`` line and once more after the last line.

    Args:
        bioes_filepath: path to the UTF-8 encoded CoNLL file to validate.

    Returns:
        True if every compatibility check succeeded, False as soon as one
        of them fails.
    """
    # NOTE(review): the message template has no placeholder, so dataset_type
    # never appears in the output. Kept as-is to leave the printed text
    # unchanged.
    dataset_type = utils.get_basename_without_extension(bioes_filepath).split(
        '_')[0]
    print("Checking validity of CONLL BIOES format... ".format(dataset_type),
          end='')

    labels_bioes = []
    labels_bio = []
    # The context manager also closes the file on the early ``return False``.
    with codecs.open(bioes_filepath, 'r', 'UTF-8') as input_conll_file:
        for line in input_conll_file:
            split_line = line.strip().split(' ')
            # New sentence (str.split(' ') always yields at least one field)
            if not split_line[0] or '-DOCSTART-' in split_line[0]:
                # NOTE(review): the label lists are not reset here;
                # presumably check_bio_bioes_compatibility empties them -
                # confirm against its implementation.
                if not check_bio_bioes_compatibility(labels_bio,
                                                     labels_bioes):
                    return False
                continue
            labels_bioes.append(split_line[-1])
            labels_bio.append(split_line[-2])

    # Check the labels that follow the last sentence boundary.
    if check_bio_bioes_compatibility(labels_bio, labels_bioes):
        print("Done.")
        return True
    return False
Exemplo n.º 2
0
def convert_conll_from_bio_to_bioes(input_conll_filepath,
                                    output_conll_filepath):
    """
    Rewrite a BIO-labelled CoNLL file as a BIOES-labelled CoNLL file.

    If ``output_conll_filepath`` already exists and
    ``check_validity_of_conll_bioes`` accepts it, nothing is done. Otherwise
    the input is read line by line: token lines are buffered (the last
    space-separated field is taken as the label) and, at every empty line /
    ``-DOCSTART-`` line and at the end of the file, the buffered lines are
    handed to ``output_conll_lines_with_bioes`` for writing. Boundary lines
    are copied to the output unchanged.

    Args:
        input_conll_filepath: path to the UTF-8 CoNLL file to read.
        output_conll_filepath: path of the UTF-8 CoNLL file to write.
    """
    if os.path.exists(output_conll_filepath):
        if check_validity_of_conll_bioes(output_conll_filepath):
            return
    # NOTE(review): the message template has no placeholder, so dataset_type
    # never appears in the output. Kept as-is to leave the printed text
    # unchanged.
    dataset_type = utils.get_basename_without_extension(
        input_conll_filepath).split('_')[0]
    print("Converting CONLL from BIO to BIOES format... ".format(dataset_type),
          end='')

    labels = []
    split_lines = []
    # Both files are closed even if the conversion raises part-way through.
    with codecs.open(input_conll_filepath, 'r', 'UTF-8') as input_conll_file, \
            codecs.open(output_conll_filepath, 'w',
                        'UTF-8') as output_conll_file:
        for line in input_conll_file:
            split_line = line.strip().split(' ')
            # New sentence (str.split(' ') always yields at least one field)
            if not split_line[0] or '-DOCSTART-' in split_line[0]:
                # NOTE(review): the buffers are not reset here; presumably
                # output_conll_lines_with_bioes empties them - confirm.
                output_conll_lines_with_bioes(split_lines, labels,
                                              output_conll_file)
                output_conll_file.write(line)
                continue
            labels.append(split_line[-1])
            split_lines.append(split_line)
        # Flush the tokens that follow the last sentence boundary.
        output_conll_lines_with_bioes(split_lines, labels, output_conll_file)

    print("Done.")
Exemplo n.º 3
0
def generate_reference_text_file_for_conll(conll_input_filepath,
                                           conll_output_filepath, text_folder):
    '''
    Generate reference text files and add the corresponding filename and
    token offsets to a conll file.

    The input is split into documents at every line whose first field
    contains -DOCSTART-. For each document the tokens (first field of each
    line) are joined with single spaces, sentences are separated by a
    newline, and the result is written to
    <text_folder>/<input basename>_text_<5-digit document number>.txt.
    Documents with no tokens produce no text file.

    conll_input_filepath: path to a conll-formatted file without filename and token offsets
    conll_output_filepath: path of the conll file to write; every token line
                           becomes "token filename start end <remaining fields>"
    text_folder: folder to write the reference text file to
    '''
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)

    def text_basename(document_index):
        # Base name (without extension) of the text file of one document.
        return '{0}_text_{1}'.format(dataset_type,
                                     str(document_index).zfill(5))

    def write_text(base_filename, parts):
        # Write the accumulated document text, unless the document is empty.
        if not parts:
            return
        with codecs.open(
                os.path.join(text_folder, '{0}.txt'.format(base_filename)),
                'w', 'UTF-8') as f:
            f.write(''.join(parts))

    # Pieces are collected in lists and joined once, instead of growing two
    # strings with += on every line.
    text_parts = []
    conll_lines = []
    character_index = 0
    document_count = 0
    text_base_filename = text_basename(document_count)
    with codecs.open(conll_input_filepath, 'r', 'UTF-8') as conll_file:
        utils.create_folder_if_not_exists(text_folder)
        for line in conll_file:
            split_line = line.strip().split(' ')
            first_field = split_line[0]
            # New document
            if '-DOCSTART-' in first_field:
                conll_lines.append(line)
                write_text(text_base_filename, text_parts)
                text_parts = []
                character_index = 0
                document_count += 1
                text_base_filename = text_basename(document_count)
                continue
            # New sentence
            if not first_field:
                conll_lines.append('\n')
                # No leading newline at the very start of a document.
                if text_parts:
                    text_parts.append('\n')
                    character_index += 1
                continue
            token = first_field
            start = character_index
            end = start + len(token)
            text_parts.append(token + ' ')
            # +1 accounts for the space written after the token.
            character_index = end + 1
            conll_lines.append(' '.join(
                [token, text_base_filename,
                 str(start), str(end)] + split_line[1:]) + '\n')
        # Text of the last document.
        write_text(text_base_filename, text_parts)

    with codecs.open(conll_output_filepath, 'w', 'UTF-8') as f:
        f.write(''.join(conll_lines))
Exemplo n.º 4
0
def xml_to_brat(input_folder, output_folder, overwrite=True):
    '''
    Convert every *.xml file of input_folder into brat output in output_folder.

    For each XML file (processed in sorted path order) the content of its
    TEXT element is written to <output_folder>/<basename>.txt, and one entity
    per child of the first TAGS element is collected and handed to
    output_entities together with the text.

    input_folder: folder containing the .xml files to convert
    output_folder: folder to write to; removed first when overwrite is true
    overwrite: also forwarded to output_entities
    '''
    print('input_folder: {0}'.format(input_folder))
    started_at = time.time()
    if overwrite:
        shutil.rmtree(output_folder, ignore_errors=True)
    utils.create_folder_if_not_exists(output_folder)

    xml_paths = glob.glob(os.path.join(input_folder, '*.xml'))
    xml_paths.sort()
    for xml_path in xml_paths:
        base_name = utils.get_basename_without_extension(xml_path)
        txt_path = os.path.join(output_folder, '{0}.txt'.format(base_name))
        root = xml.etree.ElementTree.parse(xml_path).getroot()

        # Document text
        document_text = root.findtext('TEXT')
        with codecs.open(txt_path, 'w', 'UTF-8') as txt_file:
            txt_file.write(document_text)

        # PHI tags: only the first <TAGS>...</TAGS> element is read
        tags_element = root.findall('TAGS')[0]
        entities = [
            {
                'label': tag.get('TYPE'),
                'text': tag.get('text'),
                'start': int(tag.get('start')),
                'end': int(tag.get('end')),
            }
            for tag in tags_element
        ]
        output_entities(output_folder,
                        base_name,
                        entities,
                        txt_path,
                        document_text,
                        overwrite=overwrite)

    elapsed = time.time() - started_at
    print("Time spent formatting: {0:.2f} seconds".format(elapsed))
Exemplo n.º 5
0
 def _create_stats_graph_folder(self, parameters):
     """
     Create the per-experiment output folder and return its path.

     The folder is named ``<dataset name>_<timestamp>`` and is created inside
     ``parameters['output_folder']`` (which is created first if missing).

     Args:
         parameters: mapping that provides 'dataset_text_folder' and
             'output_folder'.

     Returns:
         Tuple ``(stats_graph_folder, experiment_timestamp)``.
     """
     timestamp = utils.get_current_time_in_miliseconds()
     dataset_basename = utils.get_basename_without_extension(
         parameters['dataset_text_folder'])

     output_root = parameters['output_folder']
     utils.create_folder_if_not_exists(output_root)

     # Folder where to save graphs
     experiment_folder = os.path.join(
         output_root, '{0}_{1}'.format(dataset_basename, timestamp))
     utils.create_folder_if_not_exists(experiment_folder)
     return experiment_folder, timestamp
Exemplo n.º 6
0
def check_compatibility_between_conll_and_brat_text(conll_filepath,
                                                    brat_folder):
    '''
    Check if token offsets match between conll and brat .txt files.

    Every token line of the conll file is expected to hold, separated by
    single spaces: token text, brat file name (without extension), start
    offset, end offset, followed by further fields. The token text is
    compared with the slice [start:end] of <brat_folder>/<file name>.txt.
    A mismatch prints a warning; it raises only if the texts still differ
    after replacing spaces in the brat slice with '-'.

    conll_filepath: path to conll file
    brat_folder: folder that contains the .txt (and .ann) files that are formatted according to brat.

    Raises AssertionError if a token cannot be reconciled with the brat text.
    '''
    dataset_type = utils.get_basename_without_extension(conll_filepath)
    print("Checking compatibility between CONLL and BRAT for {0} set ... ".
          format(dataset_type),
          end='')

    previous_filename = ''
    # The context manager closes the conll file on success and on error.
    with codecs.open(conll_filepath, 'r', 'UTF-8') as conll_file:
        for line in conll_file:
            line = line.strip().split(' ')
            # New sentence (str.split(' ') always yields at least one field)
            if not line[0] or '-DOCSTART-' in line[0]:
                continue

            filename = str(line[1])
            # New file: load the brat text only when the file name changes
            if filename != previous_filename:
                text_filepath = os.path.join(brat_folder,
                                             '{0}.txt'.format(filename))
                with codecs.open(text_filepath, 'r', 'UTF-8') as f:
                    text = f.read()
                previous_filename = filename

            token_text = str(line[0])
            start = int(line[2])
            end = int(line[3])
            brat_text = text[start:end]

            # check that the token text matches the original
            if token_text != brat_text:
                print(filename)
                print("Warning: conll and brat text do not match.")
                print("\tCONLL: {0}".format(token_text))
                print("\tBRAT : {0}".format(brat_text))
                # A token may stand for brat text whose spaces were replaced
                # by '-'; anything else is a real incompatibility.
                if token_text != brat_text.replace(' ', '-'):
                    raise AssertionError(
                        "CONLL and BRAT files are incompatible.")

    print("Done.")
Exemplo n.º 7
0
    def predict(self, text):
        """
        Predict the entities of a single text.

        The text is written to a temporary brat ``deploy`` folder, converted
        to a dataset through ``_get_valid_dataset_filepaths``, run through
        ``train.prediction_step``, exported with ``conll_to_brat.output_brat``
        and finally read back with ``brat_to_conll.get_entities_from_brat``.

        Side effects: increments ``self.prediction_count``; on the first call
        overwrites ``self.parameters['dataset_text_folder']`` and sets
        ``self.stats_graph_folder``; deletes every path matching ``deploy*``
        in the dataset text folder; updates ``self.dataset_filepaths``,
        ``self.dataset_brat_folders`` and ``self.modeldata``.

        Args:
            text (str): the text to annotate.

        Returns:
            The entities returned by ``brat_to_conll.get_entities_from_brat``
            for the generated brat output.

        Raises:
            AssertionError: if the text read back from the brat output differs
                from ``text`` (only when assertions are enabled).
        """
        self.prediction_count += 1

        # First call only: point the dataset folder at ./data/temp and create
        # the folder that receives the prediction output.
        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data (files and folders named deploy*)
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file; the file name carries the zero-padded
        # prediction counter.
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder,
            'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)

        # Only the third element of the prediction result is used here; it is
        # passed on as the output file path of the deploy set.
        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Print and output result: read the entities back from
        # <stats_graph_folder>/brat/deploy/.
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        # Sanity check: the exported text must be the text that was submitted.
        assert (text == text2)
        return entities
Exemplo n.º 8
0
    def _get_valid_dataset_filepaths(self,
                                     parameters,
                                     dataset_types=[
                                         'train', 'valid', 'test', 'deploy'
                                     ]):
        """
        Resolve the conll file and brat folder to use for each dataset type.

        For every dataset type, <dataset_text_folder>/<type>.txt (conll) and
        <dataset_text_folder>/<type>/ (brat) are inspected:

        * conll and brat text both present: both are checked for
          compatibility; the ``_compatible_with_brat`` conll file is
          preferred when it exists.
        * only conll present: the brat files are generated from it.
        * only brat text present: the tokenizer-specific conll file is
          checked if it exists, otherwise generated from the brat files.
        * neither present: the dataset type is left out of the result.

        When ``parameters['tagging_format']`` is ``'bioes'`` the selected
        conll file is additionally converted to a ``*_bioes.txt`` file.

        Args:
            parameters: mapping providing 'dataset_text_folder',
                'tagging_format', and (for brat-only datasets) 'tokenizer'
                and 'spacylanguage'.
            dataset_types: iterable of dataset type names to look for.

        Returns:
            Tuple ``(dataset_filepaths, dataset_brat_folders)``, two dicts
            keyed by the dataset types that were found.
        """
        dataset_filepaths = {}
        dataset_brat_folders = {}

        for dataset_type in dataset_types:
            text_folder = parameters['dataset_text_folder']
            conll_path = os.path.join(text_folder,
                                      '{0}.txt'.format(dataset_type))
            brat_folder = os.path.join(text_folder, dataset_type)
            brat_compatible_path = os.path.join(
                text_folder,
                '{0}_compatible_with_brat.txt'.format(dataset_type))

            # A conll file only counts when it is a non-empty regular file.
            has_conll = (os.path.isfile(conll_path)
                         and os.path.getsize(conll_path) > 0)
            # A brat folder only counts when it holds at least one .txt file.
            has_brat_text = (os.path.exists(brat_folder) and len(
                glob.glob(os.path.join(brat_folder, '*.txt'))) > 0)

            # Nothing available for this dataset type: leave it out.
            if not has_conll and not has_brat_text:
                continue

            if has_conll and has_brat_text:
                # Check compatibility between conll and brat files
                brat_to_conll.check_brat_annotation_and_text_compatibility(
                    brat_folder)
                if os.path.exists(brat_compatible_path):
                    conll_path = brat_compatible_path
                conll_to_brat.check_compatibility_between_conll_and_brat_text(
                    conll_path, brat_folder)
            elif has_conll:
                # Populate brat text and annotation files based on conll file
                conll_to_brat.conll_to_brat(conll_path, brat_compatible_path,
                                            brat_folder, brat_folder)
                conll_path = brat_compatible_path
            else:
                tokenizer = parameters['tokenizer']
                tokenized_path = os.path.join(
                    text_folder, '{0}_{1}.txt'.format(dataset_type,
                                                      tokenizer))
                if os.path.exists(tokenized_path):
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(
                        tokenized_path, brat_folder)
                else:
                    # Populate conll file based on brat files
                    brat_to_conll.brat_to_conll(brat_folder, tokenized_path,
                                                tokenizer,
                                                parameters['spacylanguage'])
                conll_path = tokenized_path

            if parameters['tagging_format'] == 'bioes':
                # Generate conll file with BIOES format
                bioes_path = os.path.join(
                    text_folder, '{0}_bioes.txt'.format(
                        utils.get_basename_without_extension(conll_path)))
                utils_nlp.convert_conll_from_bio_to_bioes(
                    conll_path, bioes_path)
                conll_path = bioes_path

            dataset_filepaths[dataset_type] = conll_path
            dataset_brat_folders[dataset_type] = brat_folder

        return dataset_filepaths, dataset_brat_folders
Exemplo n.º 9
0
def conll_to_brat(conll_input_filepath,
                  conll_output_filepath,
                  brat_original_folder,
                  brat_output_folder,
                  overwrite=False):
    '''
    Convert the conll file to brat annotations and output them to brat_output_folder,
    with reference to the existing text files in brat_original_folder.
    If brat_original_folder does not exist or does not contain any text file, the text files are
    generated from the conll file, and the conll file is rewritten (to conll_output_filepath)
    with filenames and token offsets accordingly.

    The annotations are always built from conll_output_filepath, whose token lines are expected
    to hold, separated by single spaces: token, filename, start, end, ..., label.

    conll_input_filepath: path to conll file to convert to brat annotations
    conll_output_filepath: path to output conll file with filename and offsets that are compatible with brat annotations
    brat_original_folder: folder that contains the original .txt (and .ann) files that are formatted according to brat.
                          .txt files are used to check if the token offsets match and generate the annotation from conll.
    brat_output_folder: folder to output the text and brat annotations
    overwrite: forwarded to output_entities
    '''
    verbose = False
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    print("Formatting {0} set from CONLL to BRAT... ".format(dataset_type),
          end='')

    # if brat_original_folder does not exist or have any text file
    if not os.path.exists(brat_original_folder) or len(
            glob.glob(os.path.join(brat_original_folder, '*.txt'))) == 0:
        assert (conll_input_filepath != conll_output_filepath)
        generate_reference_text_file_for_conll(conll_input_filepath,
                                               conll_output_filepath,
                                               brat_original_folder)

    utils.create_folder_if_not_exists(brat_output_folder)

    previous_token_label = 'O'
    previous_filename = ''
    text_filepath = ''
    text = ''
    entities = []
    entity = {}
    # The context manager closes the conll file even if a line is malformed
    # or a referenced text file is missing.
    with codecs.open(conll_output_filepath, 'r', 'UTF-8') as conll_file:
        for line in conll_file:
            line = line.strip().split(' ')
            # New sentence (str.split(' ') always yields at least one field)
            if not line[0] or '-DOCSTART-' in line[0]:
                # Add the last entity
                if entity != {}:
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity = {}
                previous_token_label = 'O'
                continue

            filename = str(line[1])
            # New file
            if filename != previous_filename:
                # Emit the entities collected for the previous file. On the
                # first file this is called with an empty filename.
                output_entities(brat_output_folder,
                                previous_filename,
                                entities,
                                text_filepath,
                                text,
                                overwrite=overwrite)
                text_filepath = os.path.join(brat_original_folder,
                                             '{0}.txt'.format(filename))
                with codecs.open(text_filepath, 'r', 'UTF-8') as f:
                    text = f.read()
                previous_token_label = 'O'
                previous_filename = filename
                entities = []
                entity = {}

            label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER
            if label == 'O':
                # Previous entity ended
                if previous_token_label != 'O':
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity = {}
                previous_token_label = 'O'
                continue

            token = {}
            token['text'] = str(line[0])
            token['start'] = int(line[2])
            token['end'] = int(line[3])
            # check that the token text matches the original
            if token['text'] != text[token['start']:token['end']].replace(
                    ' ', '-'):
                print("Warning: conll and brat text do not match.")
                print("\tCONLL: {0}".format(token['text']))
                print("\tBRAT : {0}".format(
                    text[token['start']:token['end']]))
            # Strip the two-character prefix ("B-", "I-", ...) from the label
            token['label'] = label[2:]

            if label[:2] == 'B-':
                if previous_token_label != 'O':
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                # Start a new entity
                entity = token
            elif label[:2] == 'I-':
                # Entity continued
                if previous_token_label == token['label']:
                    # if there is no newline between the entity and the token
                    if '\n' not in text[entity['end']:token['start']]:
                        # Update entity
                        entity['text'] = entity['text'] + ' ' + token['text']
                        entity['end'] = token['end']
                    else:  # newline between the entity and the token
                        # End the previous entity
                        if verbose: print("entity: {0}".format(entity))
                        entities.append(entity)
                        # Start a new entity
                        entity = token
                elif previous_token_label != 'O':
                    # TODO: count BI or II incompatibility
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    # Start new entity
                    entity = token
                else:  # previous_token_label == 'O'
                    # TODO: count  OI incompatibility
                    # Start new entity
                    entity = token
            previous_token_label = token['label']
        # Emit the entities of the last file.
        output_entities(brat_output_folder,
                        previous_filename,
                        entities,
                        text_filepath,
                        text,
                        overwrite=overwrite)
    print('Done.')