Example #1
0
    def train(self, documents, model):
        """ It returns a RelationModel object.

        """
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        path_model_attribute = (PATH_MODEL_FOLDER, model.name)
        trainingset_path = '{}/{}/relation.trainingset.TLINK'.format(
            *path_model_attribute)
        header = relation_matrix(documents, trainingset_path, training=True)
        model.load_relation_header(header)
        model_path = model.path_relation
        crf_command = [
            PATH_CRF_PP_ENGINE_TRAIN, '-p',
            str(self.num_cores), model.path_relation_topology,
            trainingset_path, model_path
        ]

        with Mute_stderr():
            process = subprocess.Popen(crf_command, stdout=subprocess.PIPE)
            _, _ = process.communicate()

        # Weakly check the output models
        if not os.path.isfile(model_path):
            logging.error('Temporal relation model: *not* trained.')
        else:
            logging.info('Temporal relation model: trained.')
        return model
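For reference, the crf_command list above assembles a CRF++ training call. A rough expansion, assuming PATH_CRF_PP_ENGINE_TRAIN points at CRF++'s crf_learn binary and num_cores is 4 (all paths below are purely illustrative, not the project's actual configuration):

# Hypothetical expansion of crf_command; paths and thread count are made up.
crf_command = [
    '/usr/local/bin/crf_learn', '-p', '4',       # CRF++ trainer, 4 threads
    'models/sample/relation.topology',           # feature template file
    'models/sample/relation.trainingset.TLINK',  # training matrix written above
    'models/sample/relation.model'               # output model file
]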
Example #2
0
    def test(self, documents, model):
        """ It returns a List of <Document> (with .predicted_annotations filled
            in.)

        """
        logging.info('Temporal relation extraction: applying ML models.')
        testset_path = NamedTemporaryFile(delete=False).name
        relation_matrix(documents, testset_path, training=False)
        crf_command = [
            PATH_CRF_PP_ENGINE_TEST, '-m', model.path_relation, testset_path
        ]

        # Weakly check the input files
        if not os.path.isfile(model.path_relation):
            logging.warning('Model doesn\'t exist at {}'.format(
                model.path_relation))
            return documents
        else:
            if os.stat(model.path_relation).st_size == 0:
                logging.warning('Relation model is empty!')
                return documents
        if not os.path.isfile(testset_path):
            msg = 'Temporal relation test set doesn\'t exist at {}.'
            logging.error(msg.format(testset_path))
            return documents

        with Mute_stderr():
            process = subprocess.Popen(crf_command,
                                       stdout=subprocess.PIPE,
                                       stderr=None,
                                       stdin=None)

            tlink_counter = 0
            for line in iter(process.stdout.readline, ''):
                line = line.strip()
                if line:
                    line = line.split('\t')
                    relation_type = line[-1].strip()
                    if relation_type != 'O':
                        n_doc, from_id, to_id = line[-2].split('_')
                        n_doc = int(n_doc)
                        annotations = documents[n_doc].predicted_annotations
                        tlink_id = 'TL{}'.format(tlink_counter)
                        from_obj = annotations[from_id]
                        to_obj = annotations[to_id]
                        annotations[tlink_id] = TemporalLink(
                            tlink_id, from_obj, to_obj, relation_type)
                        tlink_counter += 1
            # close stdout
            process.stdout.close()
            process.wait()

        # delete testset
        os.remove(testset_path)

        logging.info('Temporal relation extraction: done.')
        return documents
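The parsing loop above expects one tab-separated row per candidate pair from crf_test, with the pair identifier ('<doc index>_<from id>_<to id>') in the second-to-last column and the predicted relation type in the last one. A minimal sketch of that column layout, using a hypothetical row:

# Hypothetical crf_test output row; the feature columns are made up.
row = 'feat_1\tfeat_2\t0_e1_t2\tBEFORE'
columns = row.split('\t')
relation_type = columns[-1].strip()             # 'BEFORE' ('O' means no link)
n_doc, from_id, to_id = columns[-2].split('_')
# -> a TemporalLink from annotation 'e1' to 't2' in documents[0]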
Example #3
0
    def train(self, documents, model_name):
        """It returns a ClassificationModel object.

        """
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        model = ClassificationModel(model_name)

        # load the header into the model
        first_word = documents[0].sentences[0].words[0]
        header = [k for k, _ in sorted(first_word.attributes.items())]
        model.load_header(header)

        # search for the token_normalised attribute position
        token_normalised_pos = [
            p for p, a in enumerate(header) if a.find('token_normalised') > -1
        ][0]
        model.pp_pipeline_attribute_pos = token_normalised_pos

        # save trainingset to model_name.trainingset.class
        scaling_factors = {}
        for idnt_class in ('EVENT', 'TIMEX'):
            path_and_model = (PATH_MODEL_FOLDER, model.name, idnt_class)
            trainingset_path = '{}/{}/identification.trainingset.{}'.format(
                *path_and_model)
            identification_attribute_matrix(documents, trainingset_path,
                                            idnt_class)

            # save scale factors for post processing pipeline
            scaling_factors[idnt_class] = get_scale_factors(
                trainingset_path, token_normalised_pos)

            crf_command = [
                PATH_CRF_PP_ENGINE_TRAIN, '-p',
                str(self.num_cores), model.path_topology, trainingset_path,
                '{}.{}'.format(model.path, idnt_class)
            ]
            with Mute_stderr():
                process = subprocess.Popen(crf_command, stdout=subprocess.PIPE)
                _, _ = process.communicate()

            # TO-DO: Check if the script saves a model or returns an error
            logging.info(
                'Identification CRF model ({}): trained.'.format(idnt_class))

        # save factors in the model
        model.load_scaling_factors(scaling_factors)

        return model
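A self-contained illustration of the header logic above, with hypothetical word attributes:

# Hypothetical attributes of the first word; the header is their sorted
# key list and token_normalised_pos is the index of the matching column.
attributes = {'part_of_speech': 'NN', 'token_normalised': 'meeting'}
header = [k for k, _ in sorted(attributes.items())]
# header == ['part_of_speech', 'token_normalised']
token_normalised_pos = [
    p for p, a in enumerate(header) if a.find('token_normalised') > -1
][0]
# token_normalised_pos == 1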
Example #4
0
    def train(self, documents, model):
        """It returns a ClassificationModel object for event CLASS attributes.

        """
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        # save trainingset to model_name.trainingset.*attribute*
        for attribute in self.attributes:
            path_model_attribute = (PATH_MODEL_FOLDER, model.name, attribute)
            trainingset_path = '{}/{}/normalisation.trainingset.{}'.format(
                *path_model_attribute)
            normalisation_attribute_matrix(documents,
                                           trainingset_path,
                                           attribute,
                                           training=True)
            model_path = '{}.{}'.format(model.path_normalisation, attribute)
            crf_command = [
                PATH_CRF_PP_ENGINE_TRAIN, '-p',
                str(self.num_cores), model.path_attribute_topology,
                trainingset_path, model_path
            ]

            with Mute_stderr():
                process = subprocess.Popen(crf_command, stdout=subprocess.PIPE)
                _, _ = process.communicate()

            # Weakly check the output models
            if not os.path.isfile(model_path):
                msg = 'Normalisation CRF model ({}): *not* trained.'
                logging.error(msg.format(attribute))
            else:
                msg = 'Normalisation CRF model ({}): trained.'
                logging.info(msg.format(attribute))
        return model
Example #5
0
    def test(self, documents, model, domain='general'):
        """It returns the sequence of labels from the classifier.

        It returns the same data structure (list of documents, of sentences,
        of words) with the right labels.

        """
        logging.info('Normalisation: applying ML models.')
        for attribute in self.attributes:
            testset_path = NamedTemporaryFile(delete=False).name
            model_path = '{}.{}'.format(model.path_normalisation, attribute)
            normalisation_attribute_matrix(documents,
                                           testset_path,
                                           attribute,
                                           training=False)
            crf_command = [
                PATH_CRF_PP_ENGINE_TEST, '-m', model_path, testset_path
            ]

            # Weakly check the input files
            if not os.path.isfile(model_path):
                logging.warning(
                    'Model doesn\'t exist at {}'.format(model_path))
                continue
            else:
                if os.stat(model_path).st_size == 0:
                    msg = 'Normalisation model for {} is empty!'
                    logging.warning(msg.format(attribute.lower()))
                    continue
            if not os.path.isfile(testset_path):
                msg = 'Normalisation test set for {} doesn\'t exist at {}.'
                logging.error(msg.format(attribute.lower(), testset_path))
                continue

            with Mute_stderr():
                process = subprocess.Popen(crf_command,
                                           stdout=subprocess.PIPE,
                                           stderr=None,
                                           stdin=None)

                for line in iter(process.stdout.readline, ''):
                    line = line.strip()
                    if line:
                        line = line.split('\t')
                        label = line[-1]
                        location = line[-2]
                        seq_label = SequenceLabel(line[-3])
                        if seq_label.is_event():
                            n_doc, n_sent, n_word = location.split('_')
                            documents[int(n_doc)]\
                                .sentences[int(n_sent)].words[int(n_word)]\
                                .tag_attributes[attribute] = label

                # close stdout
                process.stdout.close()
                process.wait()

            # delete testset
            os.remove(testset_path)

        # normalisation of temporal expressions and events
        for document in documents:
            for element in document.predicted_annotations.itervalues():
                if isinstance(element, Event):
                    element.normalise(document)
                elif isinstance(element, TemporalExpression):
                    utterance = document.dct.replace('-', '')
                    if domain == 'general':
                        element.normalise(document, utterance)
                    elif domain == 'clinical':
                        element.normalise(document, utterance, 'clinical')

        logging.info('Normalisation: done.')
        return documents
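The inner loop above assumes the last three columns of each crf_test row are the sequence label, a '<doc>_<sentence>_<word>' location and the predicted attribute value. A minimal sketch with a hypothetical row:

# Hypothetical crf_test output row; the feature column is made up.
row = 'feat_1\tI-EVENT\t0_2_5\tOCCURRENCE'
columns = row.split('\t')
label = columns[-1]                        # predicted attribute value
n_doc, n_sent, n_word = columns[-2].split('_')
# -> documents[0].sentences[2].words[5].tag_attributes[attribute] = label,
#    applied only when SequenceLabel(columns[-3]) marks an event token.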
Example #6
0
    def test(self, documents, model, post_processing_pipeline=False):
        """It returns the sequence of labels from the CRF classifier.

        It returns the same data structure (list of documents, of sentences,
        of words) with the right labels.

        """
        logging.info('Identification: applying ML models.')
        if extractors_stamp() != model.extractors_md5:
            logging.warning('The feature extractor component is different ' +
                            'from the one used in the training!')

        if post_processing_pipeline:
            try:
                factors = cPickle.load(open(model.path_factors))
                logging.info('Scale factors loaded.')
            except IOError:
                post_processing_pipeline = False
                logging.warning('Scale factors not found.')

        for idnt_class in ('EVENT', 'TIMEX'):
            testset_path = NamedTemporaryFile(delete=False).name
            model_path = '{}.{}'.format(model.path, idnt_class)
            identification_attribute_matrix(documents,
                                            testset_path,
                                            idnt_class,
                                            training=False)
            if post_processing_pipeline:
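                # According to the CRF++ documentation, '-v2' makes crf_test
                # also print marginal probabilities for every candidate
                # label; the post-processing pipeline below relies on that
                # extra output.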
                crf_command = [
                    PATH_CRF_PP_ENGINE_TEST, '-v2', '-m', model_path,
                    testset_path
                ]
            else:
                crf_command = [
                    PATH_CRF_PP_ENGINE_TEST, '-m', model_path, testset_path
                ]

            # Strictly check the input files
            assert os.path.isfile(model_path), 'Model not found!'
            assert os.stat(model_path).st_size > 0, 'Model is empty!'
            assert os.path.isfile(testset_path), 'Test set doesn\'t exist!'

            with Mute_stderr():
                process = subprocess.Popen(crf_command, stdout=subprocess.PIPE)

            n_doc, n_sent, n_word = 0, 0, 0

            # post-processing pipeline
            if post_processing_pipeline and factors:
                scale_factors = factors[idnt_class]
                lines = label_switcher(
                    probabilistic_correction(iter(process.stdout.readline,
                                                  ''), scale_factors,
                                             model.pp_pipeline_attribute_pos,
                                             model.num_of_features, .5),
                    scale_factors, model.pp_pipeline_attribute_pos, .87)
            else:
                lines = iter(process.stdout.readline, '')

            prev_element = None
            prev_label = SequenceLabel('O')
            n_timex, n_event = 1, 1
            for line in lines:
                line = line.strip()
                if line:
                    # read the predicted label (last column from CRF++)
                    predicted_class = line.split('\t')[-1]
                    curr_label = SequenceLabel(predicted_class)
                    # For events, the predicted label carries the event
                    # class and not just [IO]-EVENT, so the class is saved
                    # in the eclass variable and curr_label's tag is
                    # changed to just 'EVENT'.
                    if idnt_class == 'EVENT':
                        if not curr_label.is_out():
                            try:
                                eclass = curr_label.tag
                                curr_label.tag = 'EVENT'
                            except AttributeError:
                                curr_label.set_out()

                    curr_word = documents[n_doc].sentences[n_sent].words[
                        n_word]

                    # If the current word has already been positively
                    # annotated by a previously applied model, treat the
                    # current prediction as out and skip it. Note that the
                    # order of the outermost FOR loop (EVENT before TIMEX)
                    # therefore matters.

                    if not curr_word.predicted_label.is_out():
                        curr_label.set_out()

                    if curr_label != prev_label:
                        if prev_element:
                            documents[n_doc].predicted_annotations[
                                prev_element.identifier()] = prev_element
                        if curr_label.is_event():
                            prev_element = Event('e{}'.format(n_event),
                                                 [curr_word],
                                                 eclass=eclass)
                            n_event += 1
                        elif curr_label.is_timex():
                            prev_element = TemporalExpression(
                                't{}'.format(n_timex), [curr_word])
                            n_timex += 1
                        else:
                            prev_element = None
                    else:
                        if not curr_label.is_out():
                            prev_element.append_word(curr_word)

                    if not curr_label.is_out():
                        curr_word.predicted_label = curr_label

                    prev_label = curr_label

                    n_word += 1

                    if len(documents[n_doc].sentences[n_sent].words) == n_word:
                        n_word = 0
                        n_sent += 1
                        if len(documents[n_doc].sentences) == n_sent:
                            n_word, n_sent = 0, 0
                            n_doc += 1

                # This is the sentence separator: any pending annotation is
                # pushed into the document. This prevents merging an
                # annotation at the end of one sentence with one at the
                # beginning of the next.
                else:
                    if prev_element:
                        try:
                            documents[n_doc].predicted_annotations[
                                prev_element.identifier()] = prev_element
                        except IndexError:
                            # We are at the end of the document and n_doc
                            # has already been incremented, so prev_element
                            # must be added to the previous document.
                            documents[n_doc - 1].predicted_annotations[
                                prev_element.identifier()] = prev_element

        logging.info('Identification: done.')
        return documents
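An illustrative trace of the span-merging logic above; tokens and labels are hypothetical:

# Tokens:  'He'   'left'    'last'    'Friday'
# Labels:   O      I-EVENT   I-TIMEX   I-TIMEX
# -> Event('e1', [left]) and TemporalExpression('t1', [last, Friday]).
# A span is flushed into predicted_annotations whenever the label changes
# or an empty line (the sentence separator) is read.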
Example #7
0
    def parse(self, file_path):
        """It parses the content of file_path and extracts relevant information
        from a TempEval-3 annotated file. That information is packed in a
        Document object, which is our internal representation.
        """
        assert os.path.isfile(file_path), 'File path does not exist!'
        logging.info('Document {}: parsing...'.format(
            os.path.relpath(file_path)))
        xml = etree.parse(file_path)
        text_node = xml.findall(".//TEXT")[0]
        text_string = etree.tostring(text_node, method='text', encoding='utf8')
        text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
        text_string = unicode(text_string, 'UTF-8')
        text_xml = unicode(text_xml, 'UTF-8')
        right_chars = len(text_xml.split('</TEXT>')[1])
        text_string = text_string[:-right_chars]
        text_xml = etree.tostring(text_node)

        # StanfordParser internally strips the text :(
        left_chars = len(text_string) - len(text_string.lstrip())
        with Mute_stderr():
            stanford_tree = CORENLP.parse(text_string)

        document = Document(file_path)
        document.text_offset = left_chars
        document.file_path = os.path.abspath(file_path)
        document.doc_id = os.path.basename(file_path)
        document.sec_times = self.get_dct(file_path)
        document.dct = document.sec_times.admission_date
        document.dct_text = document.dct.replace('-', '')
        document.title = os.path.basename(file_path)
        document.text = text_string
        document._coref = stanford_tree.get('coref', [])

        for num_sen, stanford_sentence in\
                enumerate(stanford_tree['sentences']):
            collp_deps = stanford_sentence.get('collapsed_dependencies', None)
            basic_deps = stanford_sentence.get('basic_dependencies', None)
            parsetree = stanford_sentence.get('parsetree', u'')

            sentence_text = stanford_sentence.get('text', u'')

            sentence = Sentence(id_sentence=num_sen,
                                basic_dependencies=basic_deps,
                                collapsed_dependencies=collp_deps,
                                parsetree=parsetree,
                                text=sentence_text)
            for num_word, (word_form, attr) in\
                    enumerate(stanford_sentence['words']):
                offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
                offset_end = int(attr['CharacterOffsetEnd']) - left_chars
                word = Word(word_form=word_form,
                            char_offset_begin=offset_begin,
                            char_offset_end=offset_end,
                            lemma=attr['Lemma'],
                            named_entity_tag=attr['NamedEntityTag'],
                            part_of_speech=attr['PartOfSpeech'],
                            id_token=num_word,
                            id_sentence=num_sen)
                sentence.words.append(word)
            document.sentences.append(sentence)

        document.gold_annotations = self._get_annotations(xml, document)
        document.store_gold_annotations()
        document.complete_structure()

        logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
        return document
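For reference, the loops in parse() above assume the CoreNLP wrapper returns a dictionary shaped roughly as follows; all values are hypothetical:

# Hypothetical shape of stanford_tree as consumed by parse() above.
stanford_tree = {
    'coref': [],
    'sentences': [{
        'text': u'John arrived yesterday.',
        'parsetree': u'(ROOT (S ...))',
        'basic_dependencies': [],
        'collapsed_dependencies': [],
        'words': [
            (u'John', {'CharacterOffsetBegin': '0',
                       'CharacterOffsetEnd': '4',
                       'Lemma': u'John',
                       'NamedEntityTag': u'PERSON',
                       'PartOfSpeech': u'NNP'}),
            # ... one (word_form, attributes) pair per token ...
        ],
    }],
}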
Example #8
0
        '''
        import tempfile
        from corenlp import batch_parse
        dirname = tempfile.mkdtemp()
        with tempfile.NamedTemporaryFile('w', dir=dirname, delete=False) as f:
            filename = f.name
        with codecs.open(filename, 'w', encoding='utf8') as tmp:
            tmp.write(text)
            tmp.flush()
            result = batch_parse(os.path.dirname(tmp.name), self.folder)
            result = list(result)[0]
        with open(dest_file, 'wb') as cache_file:
            cPickle.dump(result, cache_file)
        return result


with Mute_stderr():
    CORENLP = BatchedCoreNLP(PATH_CORENLP_FOLDER)


class Reader(object):
    """This class is an abstract reader for ManTIME."""
    __metaclass__ = ABCMeta

    @abstractmethod
    def parse(self, text):
        pass


class TextReader(Reader):
    '''Handles textual input.