    def train(self, documents, model):
        """It returns a RelationModel object."""
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        path_model_attribute = (PATH_MODEL_FOLDER, model.name)
        trainingset_path = '{}/{}/relation.trainingset.TLINK'.format(
            *path_model_attribute)
        header = relation_matrix(documents, trainingset_path, training=True)
        model.load_relation_header(header)
        model_path = '{}'.format(model.path_relation)
        crf_command = [PATH_CRF_PP_ENGINE_TRAIN,
                       '-p', str(self.num_cores),
                       model.path_relation_topology,
                       trainingset_path,
                       model_path]
        with Mute_stderr():
            process = subprocess.Popen(crf_command, stdout=subprocess.PIPE)
            _, _ = process.communicate()

        # Weakly check the output model
        if not os.path.isfile(model_path):
            logging.error('Temporal relation model: *not* trained.')
        else:
            logging.info('Temporal relation model: trained.')
        return model
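    # The subprocess call above is equivalent to invoking CRF++ by hand,
    # assuming PATH_CRF_PP_ENGINE_TRAIN points to its crf_learn binary
    # (paths below are illustrative):
    #
    #   crf_learn -p 4 <model.path_relation_topology> \
    #             models/<name>/relation.trainingset.TLINK \
    #             <model.path_relation>
    #
    # crf_learn takes the feature template, the training matrix produced by
    # relation_matrix() and the output model file, in this order.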
    def test(self, documents, model):
        """It returns a list of <Document> objects (with
        .predicted_annotations filled in).
        """
        logging.info('Temporal relation extraction: applying ML models.')

        testset_path = NamedTemporaryFile(delete=False).name
        relation_matrix(documents, testset_path, training=False)
        crf_command = [PATH_CRF_PP_ENGINE_TEST,
                       '-m', model.path_relation,
                       testset_path]

        # Weakly check the input files
        if not os.path.isfile(model.path_relation):
            logging.warning('Model doesn\'t exist at {}'.format(
                model.path_relation))
            return documents
        elif os.stat(model.path_relation).st_size == 0:
            logging.warning('Relation model is empty!')
            return documents
        if not os.path.isfile(testset_path):
            msg = 'Temporal relation test set doesn\'t exist at {}.'
            logging.error(msg.format(testset_path))
            return documents

        with Mute_stderr():
            process = subprocess.Popen(crf_command, stdout=subprocess.PIPE,
                                       stderr=None, stdin=None)

        tlink_counter = 0
        for line in iter(process.stdout.readline, ''):
            line = line.strip()
            if line:
                line = line.split('\t')
                relation_type = line[-1].strip()
                if relation_type != 'O':
                    n_doc, from_id, to_id = line[-2].split('_')
                    n_doc = int(n_doc)
                    annotations = documents[n_doc].predicted_annotations
                    tlink_id = 'TL{}'.format(tlink_counter)
                    from_obj = annotations[from_id]
                    to_obj = annotations[to_id]
                    annotations[tlink_id] = TemporalLink(
                        tlink_id, from_obj, to_obj, relation_type)
                    tlink_counter += 1

        # close stdout
        process.stdout.close()
        process.wait()
        # delete the test set
        os.remove(testset_path)

        logging.info('Temporal relation extraction: done.')
        return documents
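    # Shape of each CRF++ output line consumed by the loop above (values are
    # illustrative; columns are tab-separated and only the last two are
    # used):
    #
    #   <feature_1> ... <feature_n>   0_e1_t2   BEFORE
    #
    # The second-to-last column packs "<doc index>_<from id>_<to id>" and the
    # last column holds the predicted relation type ('O' means no TLINK).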
    def train(self, documents, model_name):
        """It returns a ClassificationModel object."""
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        model = ClassificationModel(model_name)

        # load the header into the model
        first_word = documents[0].sentences[0].words[0]
        header = [k for k, _ in sorted(first_word.attributes.items())]
        model.load_header(header)

        # search for the position of the token_normalised attribute
        token_normalised_pos = [p for p, a in enumerate(header)
                                if a.find('token_normalised') > -1][0]
        model.pp_pipeline_attribute_pos = token_normalised_pos

        # save the training set to model_name.trainingset.<class>
        scaling_factors = {}
        for idnt_class in ('EVENT', 'TIMEX'):
            path_and_model = (PATH_MODEL_FOLDER, model.name, idnt_class)
            trainingset_path = '{}/{}/identification.trainingset.{}'.format(
                *path_and_model)
            identification_attribute_matrix(documents, trainingset_path,
                                            idnt_class)
            # save scale factors for the post-processing pipeline
            scaling_factors[idnt_class] = get_scale_factors(
                trainingset_path, token_normalised_pos)
            crf_command = [PATH_CRF_PP_ENGINE_TRAIN,
                           '-p', str(self.num_cores),
                           model.path_topology,
                           trainingset_path,
                           '{}.{}'.format(model.path, idnt_class)]
            with Mute_stderr():
                process = subprocess.Popen(crf_command,
                                           stdout=subprocess.PIPE)
                _, _ = process.communicate()
            # TO-DO: check whether the script saves a model or returns an
            # error
            logging.info('Identification CRF model ({}): trained.'.format(
                idnt_class))

        # save the scaling factors in the model
        model.load_scaling_factors(scaling_factors)
        return model
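    # A minimal usage sketch; the classifier class name and its constructor
    # signature are hypothetical, only train() and ClassificationModel come
    # from this module:
    #
    #   >>> clf = IdentificationClassifier(num_cores=4)
    #   >>> model = clf.train(documents, 'general')
    #
    # One CRF model per identification class is written to disk, i.e.
    # "<model.path>.EVENT" and "<model.path>.TIMEX", together with the
    # scaling factors used by the optional post-processing pipeline.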
    def train(self, documents, model):
        """It returns a ClassificationModel object for event CLASS
        attributes.
        """
        # TO-DO: the feature extractor should yield instead of return
        assert type(documents) == list, 'Wrong type for documents.'
        assert len(documents) > 0, 'Empty documents list.'

        # save the training set to model_name.trainingset.<attribute>
        for attribute in self.attributes:
            path_model_attribute = (PATH_MODEL_FOLDER, model.name, attribute)
            trainingset_path = '{}/{}/normalisation.trainingset.{}'.format(
                *path_model_attribute)
            normalisation_attribute_matrix(documents, trainingset_path,
                                           attribute, training=True)
            model_path = '{}.{}'.format(model.path_normalisation, attribute)
            crf_command = [PATH_CRF_PP_ENGINE_TRAIN,
                           '-p', str(self.num_cores),
                           model.path_attribute_topology,
                           trainingset_path,
                           model_path]
            with Mute_stderr():
                process = subprocess.Popen(crf_command,
                                           stdout=subprocess.PIPE)
                _, _ = process.communicate()
            # Weakly check the output models
            if not os.path.isfile(model_path):
                msg = 'Normalisation CRF model ({}): *not* trained.'
                logging.error(msg.format(attribute))
            else:
                msg = 'Normalisation CRF model ({}): trained.'
                logging.info(msg.format(attribute))
        return model
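    # For reference: if self.attributes were, e.g., ('CLASS', 'POLARITY')
    # (illustrative values; the real list lives in self.attributes), this
    # loop would train one CRF model per attribute and save them as
    # "<model.path_normalisation>.CLASS" and
    # "<model.path_normalisation>.POLARITY".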
    def test(self, documents, model, domain='general'):
        """It returns the sequence of labels from the classifier.

        It returns the same data structure (a list of documents, of
        sentences, of words) with the right labels.
        """
        logging.info('Normalisation: applying ML models.')

        for attribute in self.attributes:
            testset_path = NamedTemporaryFile(delete=False).name
            model_path = '{}.{}'.format(model.path_normalisation, attribute)
            normalisation_attribute_matrix(documents, testset_path,
                                           attribute, training=False)
            crf_command = [PATH_CRF_PP_ENGINE_TEST,
                           '-m', model_path,
                           testset_path]

            # Weakly check the input files
            if not os.path.isfile(model_path):
                logging.warning('Model doesn\'t exist at {}'.format(
                    model_path))
                continue
            elif os.stat(model_path).st_size == 0:
                msg = 'Normalisation model for {} is empty!'
                logging.warning(msg.format(attribute.lower()))
                continue
            if not os.path.isfile(testset_path):
                msg = 'Normalisation test set for {} doesn\'t exist at {}.'
                logging.error(msg.format(attribute.lower(), testset_path))
                continue

            with Mute_stderr():
                process = subprocess.Popen(crf_command,
                                           stdout=subprocess.PIPE,
                                           stderr=None, stdin=None)

            for line in iter(process.stdout.readline, ''):
                line = line.strip()
                if line:
                    line = line.split('\t')
                    label = line[-1]
                    location = line[-2]
                    seq_label = SequenceLabel(line[-3])
                    if seq_label.is_event():
                        n_doc, n_sent, n_word = location.split('_')
                        documents[int(n_doc)]\
                            .sentences[int(n_sent)].words[int(n_word)]\
                            .tag_attributes[attribute] = label

            # close stdout
            process.stdout.close()
            process.wait()
            # delete the test set
            os.remove(testset_path)

        # normalisation of temporal expressions and events
        for document in documents:
            for element in document.predicted_annotations.itervalues():
                if isinstance(element, Event):
                    element.normalise(document)
                elif isinstance(element, TemporalExpression):
                    utterance = document.dct.replace('-', '')
                    if domain == 'general':
                        element.normalise(document, utterance)
                    elif domain == 'clinical':
                        element.normalise(document, utterance, 'clinical')

        logging.info('Normalisation: done.')
        return documents
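    # Shape of the CRF++ output consumed above (values are illustrative):
    # the last three tab-separated columns are the sequence label, the
    # "<doc>_<sentence>_<word>" location and the predicted attribute value,
    # e.g.
    #
    #   ...   I-EVENT   0_3_12   OCCURRENCE
    #
    # Only tokens whose sequence label marks an event have the predicted
    # value copied into word.tag_attributes[attribute].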
    def test(self, documents, model, post_processing_pipeline=False):
        """It returns the sequence of labels from the CRF classifier.

        It returns the same data structure (a list of documents, of
        sentences, of words) with the right labels.
        """
        logging.info('Identification: applying ML models.')

        if extractors_stamp() != model.extractors_md5:
            logging.warning('The feature extractor component is different '
                            'from the one used in the training!')

        if post_processing_pipeline:
            try:
                factors = cPickle.load(open(model.path_factors))
                logging.info('Scale factors loaded.')
            except IOError:
                post_processing_pipeline = False
                logging.warning('Scale factors not found.')

        for idnt_class in ('EVENT', 'TIMEX'):
            testset_path = NamedTemporaryFile(delete=False).name
            model_path = '{}.{}'.format(model.path, idnt_class)
            identification_attribute_matrix(documents, testset_path,
                                            idnt_class, training=False)
            if post_processing_pipeline:
                crf_command = [PATH_CRF_PP_ENGINE_TEST, '-v2',
                               '-m', model_path, testset_path]
            else:
                crf_command = [PATH_CRF_PP_ENGINE_TEST,
                               '-m', model_path, testset_path]

            # Draconianly check the input files
            assert os.path.isfile(model_path), 'Model not found!'
            assert os.stat(model_path).st_size > 0, 'Model is empty!'
            assert os.path.isfile(testset_path), 'Test set doesn\'t exist!'

            with Mute_stderr():
                process = subprocess.Popen(crf_command,
                                           stdout=subprocess.PIPE)

            n_doc, n_sent, n_word = 0, 0, 0

            # post-processing pipeline
            if post_processing_pipeline and factors:
                scale_factors = factors[idnt_class]
                lines = label_switcher(
                    probabilistic_correction(
                        iter(process.stdout.readline, ''),
                        scale_factors,
                        model.pp_pipeline_attribute_pos,
                        model.num_of_features,
                        .5),
                    scale_factors,
                    model.pp_pipeline_attribute_pos,
                    .87)
            else:
                lines = iter(process.stdout.readline, '')

            prev_element = None
            prev_label = SequenceLabel('O')
            n_timex, n_event = 1, 1
            for line in lines:
                line = line.strip()
                if line:
                    # read the predicted label (last column from CRF++)
                    predicted_class = line.split('\t')[-1]
                    curr_label = SequenceLabel(predicted_class)

                    # For events, the predicted label carries the event class
                    # and not just [IO]-EVENT. The class is therefore saved
                    # in the eclass variable and curr_label's tag is changed
                    # to just 'EVENT'.
                    if idnt_class == 'EVENT':
                        if not curr_label.is_out():
                            try:
                                eclass = curr_label.tag
                                curr_label.tag = 'EVENT'
                            except AttributeError:
                                curr_label.set_out()

                    curr_word = documents[n_doc].sentences[n_sent].words[
                        n_word]

                    # If the current word has already been positively
                    # annotated by a previous model, force the current label
                    # to 'O' so that the earlier annotation is not
                    # overwritten. Notice that the order of the outermost FOR
                    # loop of this method therefore matters.
                    if not curr_word.predicted_label.is_out():
                        curr_label.set_out()

                    if curr_label != prev_label:
                        if prev_element:
                            documents[n_doc].predicted_annotations[
                                prev_element.identifier()] = prev_element
                        if curr_label.is_event():
                            prev_element = Event('e{}'.format(n_event),
                                                 [curr_word],
                                                 eclass=eclass)
                            n_event += 1
                        elif curr_label.is_timex():
                            prev_element = TemporalExpression(
                                't{}'.format(n_timex), [curr_word])
                            n_timex += 1
                        else:
                            prev_element = None
                    else:
                        if not curr_label.is_out():
                            prev_element.append_word(curr_word)

                    if not curr_label.is_out():
                        curr_word.predicted_label = curr_label
                    prev_label = curr_label

                    n_word += 1
                    if len(documents[n_doc].sentences[n_sent].words) == \
                            n_word:
                        n_word = 0
                        n_sent += 1
                        if len(documents[n_doc].sentences) == n_sent:
                            n_word, n_sent = 0, 0
                            n_doc += 1
                else:
                    # This is the sentence separator: the pending annotation
                    # is pushed into the document. This prevents the merging
                    # of an annotation at the end of a sentence with one at
                    # the beginning of the next one.
                    if prev_element:
                        try:
                            documents[n_doc].predicted_annotations[
                                prev_element.identifier()] = prev_element
                        except IndexError:
                            # We are at the end of the document and n_doc has
                            # already been incremented; prev_element belongs
                            # to the previous document.
                            documents[n_doc - 1].predicted_annotations[
                                prev_element.identifier()] = prev_element

        logging.info('Identification: done.')
        return documents
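    # Illustrative trace of the span-building logic above (tokens and labels
    # are made up; in practice EVENT and TIMEX are predicted in two separate
    # passes of the outer loop, but the rules are the same):
    #
    #   token        predicted label   effect
    #   "admitted"   I-OCCURRENCE      new Event 'e1' (eclass='OCCURRENCE')
    #   "on"         O                 'e1' pushed into predicted_annotations
    #   "October"    I-DATE            new TemporalExpression 't1'
    #   "24"         I-DATE            appended to 't1'
    #   <blank>                        sentence separator: 't1' pushed
    #
    # A change of label w.r.t. the previous token closes the pending
    # annotation and possibly opens a new one; a blank line (CRF++ sentence
    # separator) always pushes the pending annotation into the document.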
    def parse(self, file_path):
        """It parses the content of file_path and extracts the relevant
        information from a TempEval-3 annotated file. That information is
        packed into a Document object, which is our internal representation.
        """
        assert os.path.isfile(file_path), 'File path does not exist!'
        logging.info('Document {}: parsing...'.format(
            os.path.relpath(file_path)))

        xml = etree.parse(file_path)
        text_node = xml.findall(".//TEXT")[0]
        text_string = etree.tostring(text_node, method='text',
                                     encoding='utf8')
        text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
        text_string = unicode(text_string, 'UTF-8')
        text_xml = unicode(text_xml, 'UTF-8')
        right_chars = len(text_xml.split('</TEXT>')[1])
        text_string = text_string[:-right_chars]
        text_xml = etree.tostring(text_node)

        # StanfordParser strips the text internally :(
        left_chars = len(text_string) - len(text_string.lstrip())
        with Mute_stderr():
            stanford_tree = CORENLP.parse(text_string)

        document = Document(file_path)
        document.text_offset = left_chars
        document.file_path = os.path.abspath(file_path)
        document.doc_id = os.path.basename(file_path)
        document.sec_times = self.get_dct(file_path)
        document.dct = document.sec_times.admission_date
        document.dct_text = document.dct.replace('-', '')
        document.title = os.path.basename(file_path)
        document.text = text_string
        document._coref = stanford_tree.get('coref', [])

        for num_sen, stanford_sentence in \
                enumerate(stanford_tree['sentences']):
            collp_deps = stanford_sentence.get('collapsed_dependencies',
                                               None)
            basic_deps = stanford_sentence.get('basic_dependencies', None)
            parsetree = stanford_sentence.get('parsetree', u'')
            sentence_text = stanford_sentence.get('text', u'')

            sentence = Sentence(id_sentence=num_sen,
                                basic_dependencies=basic_deps,
                                collapsed_dependencies=collp_deps,
                                parsetree=parsetree,
                                text=sentence_text)

            for num_word, (word_form, attr) in \
                    enumerate(stanford_sentence['words']):
                offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
                offset_end = int(attr['CharacterOffsetEnd']) - left_chars
                word = Word(word_form=word_form,
                            char_offset_begin=offset_begin,
                            char_offset_end=offset_end,
                            lemma=attr['Lemma'],
                            named_entity_tag=attr['NamedEntityTag'],
                            part_of_speech=attr['PartOfSpeech'],
                            id_token=num_word,
                            id_sentence=num_sen)
                sentence.words.append(word)
            document.sentences.append(sentence)

        document.gold_annotations = self._get_annotations(xml, document)
        document.store_gold_annotations()
        document.complete_structure()

        logging.info('Document {}: parsed.'.format(
            os.path.relpath(file_path)))
        return document
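    # A minimal usage sketch (the reader class name is hypothetical and the
    # file path is illustrative):
    #
    #   >>> reader = TempEval3FileReader()
    #   >>> doc = reader.parse('data/wsj_0006.tml')
    #   >>> doc.dct, len(doc.sentences), len(doc.gold_annotations)
    #
    # The returned Document carries the raw text, the CoreNLP analysis
    # (sentences and words with offsets re-aligned by left_chars) and the
    # gold annotations read from the TimeML markup.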
        '''
        import tempfile
        from corenlp import batch_parse

        dirname = tempfile.mkdtemp()
        with tempfile.NamedTemporaryFile('w', dir=dirname,
                                         delete=False) as f:
            filename = f.name
        with codecs.open(filename, 'w', encoding='utf8') as tmp:
            tmp.write(text)
            tmp.flush()
        result = batch_parse(os.path.dirname(tmp.name), self.folder)
        result = list(result)[0]
        cPickle.dump(result, open(dest_file, 'w'))
        return result


with Mute_stderr():
    CORENLP = BatchedCoreNLP(PATH_CORENLP_FOLDER)


class Reader(object):
    """This class is an abstract reader for ManTIME."""

    __metaclass__ = ABCMeta

    @abstractmethod
    def parse(self, text):
        pass


class TextReader(Reader):
    '''Handles textual input.