Example #1
0
    def __init__(self, note_path, overwrite=False):

        print "processing file...", note_path
        self.source_file = note_path
        self.overwrite = overwrite

        Note.__init__(self, note_path, note_path)

        self.raw_text = []
        self.text = []
        self.relations = []

        # read data, save text and relations
        self.load_data()
        print "data loaded"

        # send body of document to NewsReader pipeline.
        tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = \
            pre_processing.pre_process('\n'.join(self.text), note_path, overwrite=self.overwrite)

        # chunks = self.split_text(200)
        # for i, chunk in enumerate(chunks):
        #     tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = \
        #         pre_processing.pre_process('\n'.join(self.text), note_path+str(i), overwrite=self.overwrite)
        #
        #     self.pre_processed_text.update(tokenized_text)
        #     self.token_to_offset.update(token_to_offset)
        #     self.sentence_features.update(sentence_features)
        #     self.dependency_paths.update(dependency_paths)
        #     self.id_to_tok.update(id_to_tok)
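        # A minimal sketch of the split_text helper the chunked variant above
        # would call (hypothetical; the real implementation is not shown here):
        # def split_text(self, chunk_size):
        #     """Return self.text as lists of at most chunk_size lines."""
        #     return [self.text[i:i + chunk_size]
        #             for i in range(0, len(self.text), chunk_size)]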

        # {sentence_num: [{token},...], ...}
        self.pre_processed_text = tokenized_text

        # contains the char based offsets generated by tokenizer, used for asserting char offsets are correct
        # {'token':[(start, end),...],...}
        self.token_to_offset = token_to_offset

        # contains sentence level information extracted by newsreader
        self.sentence_features = sentence_features

        # dependency paths for sentences in the document
        self.dependency_paths = dependency_paths

        # map token ids to tokens within self.tokenized_text
        # {'wid':'token'}
        self.id_to_tok = id_to_tok
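        # Consumption sketch (shapes assumed from the comments above, not from
        # the pre_processing source): rebuild each sentence from the token
        # dicts and check the tokenizer offsets against the raw text.
        # raw = '\n'.join(self.text)
        # for sent_num, tokens in self.pre_processed_text.items():
        #     sentence = ' '.join(tok['token'] for tok in tokens)
        # for token, spans in self.token_to_offset.items():
        #     for start, end in spans:
        #         assert raw[start:end] == token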

        self.discourse_connectives = {}

        self.iob_labels = []
        self.semLinks = []

        # get list of [{'entity_id': 10002, 'entity_label': 'Component-Whole(e2,e1)', 'entity_type': 'e2'}, ...]
        self.get_labels()
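        # Sketch of consuming the label dicts (shape taken from the comment
        # above; get_labels is assumed to store them in self.iob_labels):
        # relation_types = set(l['entity_label'] for l in self.iob_labels)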
        """
Example #2
0
    def __init__(self, string, fret, interval=c.INTVL_ROOT):
        self.set_string(string)
        self.set_fret(fret)

        note = string.get_note_at_fret(fret)
        Note.__init__(self, note.get_name(), note.get_accidental(), interval)
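
    # Usage sketch (the FretNote class name and the String object are
    # hypothetical, inferred from the calls above): a fretted-note object
    # resolves its pitch from the open string plus the fret offset, e.g.
    # note = FretNote(a_string, fret=5)
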
    def __init__(self, timeml_note_path, annotated_timeml_path=None, verbose=False):

        if verbose: print("called TimeNote constructor")

        Note.__init__(self, timeml_note_path, annotated_timeml_path)

        # get body of document
        data = get_text(timeml_note_path)

        # original text body of timeml doc
        self.original_text = data

        # send body of document to NewsReader pipeline.
        tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = \
            pre_processing.pre_process(data, timeml_note_path)

        # {sentence_num: [{token},...], ...}
        self.pre_processed_text = tokenized_text

        # contains the char based offsets generated by tokenizer, used for asserting char offsets are correct
        # {'token':[(start, end),...],...}
        self.token_to_offset = token_to_offset

        # contains sentence level information extracted by newsreader
        self.sentence_features = sentence_features

        # dependency paths for sentences in the document
        self.dependency_paths = dependency_paths

        # map token ids to tokens within self.tokenized_text
        # {'wid':'token'}
        self.id_to_tok = id_to_tok

        self.discourse_connectives = {}

        self.iob_labels = []

        """
        print "\n\nself.original_text:\n\n"
        print self.original_text
        print "\n\n"

        print "self.pre_processed_text:\n\n"
        print tokenized_text
        print "\n\n"

        print "self.token_to_offset:\n\n"
        print self.token_to_offset
        print "\n\n"

        print "self.sentence_features:\n\n"
        print self.sentence_features
        print "\n\n"
        """

        self.tlinks = []

        if self.annotated_note_path is not None:

            self.get_tlinked_entities()

            # will store labels in self.iob_labels
            self.get_labels()
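
        # Usage sketch (file paths hypothetical; TimeNote is the class this
        # constructor belongs to): passing annotated_timeml_path presumably
        # populates self.tlinks and self.iob_labels; without it they stay empty.
        # note = TimeNote("doc.tml", annotated_timeml_path="doc.ann.tml")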