def __init__(self, note_path, overwrite=False): print "processing file...", note_path self.source_file = note_path self.overwrite = overwrite _Note = Note.__init__(self, note_path, note_path) self.raw_text = [] self.text = [] self.relations = [] # read data, save text and relations self.load_data() print "data loaded" # send body of document to NewsReader pipeline. tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = \ pre_processing.pre_process('\n'.join(self.text), note_path, overwrite=self.overwrite) # chunks = self.split_text(200) # for i, chunk in enumerate(chunks): # tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = \ # pre_processing.pre_process('\n'.join(self.text), note_path+str(i), overwrite=self.overwrite) # # self.pre_processed_text.update(tokenized_text) # self.token_to_offset.update(token_to_offset) # self.sentence_features.update(sentence_features) # self.dependency_paths.update(dependency_paths) # self.id_to_tok.update(id_to_tok) # {sentence_num: [{token},...], ...} self.pre_processed_text = tokenized_text # contains the char based offsets generated by tokenizer, used for asserting char offsets are correct # {'token':[(start, end),...],...} self.token_to_offset = token_to_offset # contains sentence level information extracted by newsreader self.sentence_features = sentence_features # dependency paths for sentences in the document self.dependency_paths = dependency_paths # map token ids to tokens within self.tokenized_text # {'wid':'token'} self.id_to_tok = id_to_tok self.discourse_connectives = {} self.iob_labels = [] self.semLinks = [] # get list of [{'entity_id': 10002, 'entity_label': 'Component-Whole(e2,e1)', 'entity_type': 'e2'}, ...] self.get_labels() """
def __init__(self, string, fret, interval=c.INTVL_ROOT):
    """Build a fretted note: attach it to a string/fret position, then
    initialise the underlying Note from the pitch found at that fret."""
    # record where on the instrument this note lives
    self.set_string(string)
    self.set_fret(fret)

    # resolve the concrete pitch at this string/fret and delegate to Note
    fretted = string.get_note_at_fret(fret)
    name = fretted.get_name()
    accidental = fretted.get_accidental()
    Note.__init__(self, name, accidental, interval)
def __init__(self, timeml_note_path, annotated_timeml_path=None, verbose=False): if verbose: print "called TimeNote constructor" _Note = Note.__init__(self, timeml_note_path, annotated_timeml_path) # get body of document data = get_text(timeml_note_path) # original text body of timeml doc self.original_text = data # send body of document to NewsReader pipeline. tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = pre_processing.pre_process(data, timeml_note_path) # {sentence_num: [{token},...], ...} self.pre_processed_text = tokenized_text # contains the char based offsets generated by tokenizer, used for asserting char offsets are correct # {'token':[(start, end),...],...} self.token_to_offset = token_to_offset # contains sentence level information extracted by newsreader self.sentence_features = sentence_features # dependency paths for sentences in the document self.dependency_paths = dependency_paths # map token ids to tokens within self.tokenized_text # {'wid':'token'} self.id_to_tok = id_to_tok self.discourse_connectives = {} self.iob_labels = [] """ print "\n\nself.original_text:\n\n" print self.original_text print "\n\n" print "self.pre_processed_text:\n\n" print tokenized_text print "\n\n" print "self.token_to_offset:\n\n" print self.token_to_offset print "\n\n" print "self.sentence_features:\n\n" print self.sentence_features print "\n\n" """ self.tlinks = [] if self.annotated_note_path is not None: self.get_tlinked_entities() # will store labels in self.iob_labels self.get_labels()