def load_corpus(self, corenlpserver):
    """Load every file in the corpus directory into ``self.documents``.

    ``self.path`` is the base directory of the corpus. Every entry in it
    whose name does not end with ``~`` is read, wrapped in a ``Document``
    (keyed by the file's base name) and processed with the "biomedical"
    setting.

    While walking the sentences of each document:

    * Sentences from a ``[start section id="X"]`` marker through the next
      ``[end section id="X"]`` marker (both markers included), for any
      ``X`` in ``self.invalid_sections``, are collected into
      ``newdoc.invalid_sids``.
    * Sentences that are fully bracketed (``[...]``) or title-cased are
      appended to ``newdoc.title_sids``.

    Args:
        corenlpserver: Passed through unchanged to
            ``Document.process_document``.

    Returns:
        None. Results are stored in ``self.documents``.
    """
    # os.path.join works whether or not self.path ends with a separator.
    trainfiles = [
        os.path.join(self.path, name)
        for name in os.listdir(self.path)
        if not name.endswith('~')
    ]
    if not trainfiles:
        # Nothing to load; avoid starting a zero-length progress bar.
        return

    # The section markers do not depend on the document or sentence, so
    # build them once instead of once per sentence.
    start_markers = {
        '[start section id="{}"]'.format(section)
        for section in self.invalid_sections
    }
    end_markers = {
        '[end section id="{}"]'.format(section)
        for section in self.invalid_sections
    }

    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(trainfiles)).start()
    for i, openfile in enumerate(trainfiles):
        basename = os.path.basename(openfile)
        with open(openfile, 'r') as inputfile:
            text = inputfile.read()
        newdoc = Document(text, process=False, did=basename,
                          title="titulo_" + basename)
        # process_document calls the tokenizer
        newdoc.process_document(corenlpserver, "biomedical")

        valid = True
        invalid_sids = []
        for s in newdoc.sentences:
            if s.text in start_markers:
                valid = False
            if not valid:
                invalid_sids.append(s.sid)
            # The end marker is checked after recording, so the marker
            # sentence itself is still counted as invalid.
            if s.text in end_markers:
                valid = True
            if (s.text.startswith("[") and s.text.endswith("]")) or s.text.istitle():
                newdoc.title_sids.append(s.sid)

        newdoc.invalid_sids = invalid_sids
        logging.debug("invalid sentences: %s", invalid_sids)
        logging.debug("title sentences: %s", newdoc.title_sids)
        self.documents[newdoc.did] = newdoc
        pbar.update(i + 1)
    # NOTE(review): assumes `pb` is the `progressbar` package, whose
    # ProgressBar exposes finish() - confirm against the file's imports.
    pbar.finish()