# Module-level imports assumed by the methods below (Document and Corpus are
# project classes whose import path is not shown in these snippets):
import os
import io
import time
import codecs
import logging
import progressbar as pb
import MySQLdb
from bs4 import BeautifulSoup

def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f.split(".")[0]
        t = time.time()
        with open(f, 'r') as txt:
            doctext = txt.read()
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

def load_corpus(self, corenlpserver, process=True):
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f  # the file path doubles as the document ID
        t = time.time()
        # use a distinct name so the loop variable f is not shadowed
        with open(f, 'r') as xmlfile:
            article = "<Article>" + xmlfile.read() + "</Article>"
        soup = BeautifulSoup(article, 'xml')
        #doc = soup.find_all("article")
        title = soup.ArticleTitle.get_text()
        abstract = soup.AbstractText.get_text()
        doc_text = title + " " + abstract
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        #logging.info(len(newdoc.sentences))
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

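# The loader above assumes each input file holds a PubMed-style XML fragment
# containing ArticleTitle and AbstractText elements; a minimal hypothetical
# example (made-up content, not from the original corpus):
#
#   <ArticleTitle>A sample title</ArticleTitle>
#   <AbstractText>A sample abstract.</AbstractText>
#
# The "<Article>" wrapper added before parsing gives the fragment a single
# root element so BeautifulSoup can parse it as XML.
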
def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f.split(".")[0].split("/")[-1]
        t = time.time()
        with codecs.open(f, 'r', 'utf-8') as txt:
            doctext = txt.read()
        doctext = doctext.replace("\n", " ")
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

def load_corpus(self, corenlpserver, process=True): """Load the CHEMDNER corpus file on the dir element""" # open filename and parse lines total_lines = sum(1 for line in open(self.path)) widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()] pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start() n_lines = 1 time_per_abs = [] with io.open(self.path, 'r', encoding="utf-8") as inputfile: for line in inputfile: t = time.time() # each line is PMID title abs tsv = line.split('\t') doctext = tsv[1].strip().replace("<", "(").replace(">", ")") + " " doctext += tsv[2].strip().replace("<", "(").replace(">", ")") newdoc = Document(doctext, process=False, did=tsv[0], title=tsv[1].strip() + ".") newdoc.sentence_tokenize("biomedical") if process: newdoc.process_document(corenlpserver, "biomedical") self.documents[newdoc.did] = newdoc abs_time = time.time() - t time_per_abs.append(abs_time) pbar.update(n_lines) n_lines += 1 pbar.finish() abs_avg = sum(time_per_abs)*1.0/len(time_per_abs) logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True): """Load the CHEMDNER corpus file on the dir element""" # open filename and parse lines total_lines = sum(1 for line in open(self.path)) widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()] pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start() n_lines = 1 time_per_abs = [] with io.open(self.path, 'r', encoding="utf-8") as inputfile: for line in inputfile: t = time.time() # each line is PMID title abs tsv = line.split('\t') doctext = tsv[1].strip().replace("<", "(").replace(">", ")").replace(". ", ", ") + ". " doctext += tsv[2].strip().replace("<", "(").replace(">", ")") newdoc = Document(doctext, process=False, did=tsv[0], title=tsv[1].strip() + ".") newdoc.sentence_tokenize("biomedical") if process: newdoc.process_document(corenlpserver, "biomedical") self.documents[newdoc.did] = newdoc abs_time = time.time() - t time_per_abs.append(abs_time) pbar.update(n_lines) n_lines += 1 pbar.finish() abs_avg = sum(time_per_abs)*1.0/len(time_per_abs) logging.info("average time per abstract: %ss" % abs_avg)
def generate_corpus(self, text):
    """
    Create a corpus object from the input text.
    :param text: raw text of the document to be processed
    :return: a Corpus object containing a single processed Document
    """
    test_corpus = Corpus("")
    newdoc = Document(text, process=False, did="d0", title="Test document")
    newdoc.sentence_tokenize("biomedical")
    newdoc.process_document(self.corenlp, "biomedical")
    test_corpus.documents["d0"] = newdoc
    return test_corpus

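# Hypothetical usage (the name `extractor` is assumed, not from the original
# source): given an object exposing this method with a running CoreNLP client
# on its .corenlp attribute:
#
#   corpus = extractor.generate_corpus(u"Aspirin inhibits platelet aggregation.")
#   print(corpus.documents["d0"].sentences)
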
def create_sentences(self, doctag, text):
    # Create sentence entries based on text from document doctag
    cur = self.db_conn.cursor()
    newdoc = Document(text, process=False, did=doctag)
    newdoc.sentence_tokenize("biomedical")
    for i, sentence in enumerate(newdoc.sentences):
        corenlpres = sentence.process_sentence(self.corenlp)
        query = """INSERT INTO sentence(senttag, doctag, senttext, sentoffset, corenlp)
                   VALUES (%s, %s, %s, %s, %s);"""
        try:
            cur.execute(query, (sentence.sid, doctag, sentence.text.encode("utf8"),
                                sentence.offset, str(corenlpres).encode("utf8")))
            self.db_conn.commit()
            #inserted_id = cur.lastrowid
            #return str(inserted_id)
        except MySQLdb.MySQLError as e:
            self.db_conn.rollback()
            logging.debug(e)

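# A minimal sketch of the `sentence` table that the INSERT above writes to.
# The column names come from the query itself; the column types are
# assumptions, not taken from the original schema.
def create_sentence_table(db_conn):
    cur = db_conn.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS sentence(
                       senttag VARCHAR(100),
                       doctag VARCHAR(100),
                       senttext TEXT,
                       sentoffset INT,
                       corenlp LONGTEXT
                   );""")
    db_conn.commit()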