Пример #1
0
 def load_corpus(self, corenlpserver, process=True):
     """Load a JNLPBA-format corpus from ``self.path`` into ``self.documents``.

     The file is token-per-line: a line starting with ``###`` opens a new
     document (its id is the text after the last ``:``), a blank line ends
     the current sentence, and any other line is a tab-separated token whose
     first field is the surface form.  Each completed document is processed
     with *corenlpserver* and stored in ``self.documents`` keyed by its id.
     """
     widgets = [
         pb.Percentage(), ' ',
         pb.Bar(), ' ',
         pb.ETA(), ' ',
         pb.Timer()
     ]
     # First pass: index of the last line, used both to size the progress
     # bar and to flush the final document when it is reached.
     nlines = 0
     with open(self.path) as f:
         for nlines, l in enumerate(f):
             pass
     print(nlines)  # print() call form works on both Python 2 and 3
     pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
     with codecs.open(self.path, 'r', "utf-8") as corpusfile:
         doc_text = ""
         sentences = []
         # Initialize parser state so a stray blank or token line before the
         # first "###" header cannot raise a NameError.
         did = None
         sentence_text = ""
         doc_offset = 0
         for i, l in enumerate(corpusfile):
             if l.startswith("###"):  # new doc
                 if doc_text != "":
                     # Flush the previous document before starting this one.
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 did = "JNLPBA" + l.strip().split(":")[-1]
                 logging.debug("starting new document:" + did)
                 sentence_text = ""
                 doc_offset = 0
                 sentences = []
             elif l.strip() == "" and sentence_text != "":  # new sentence
                 sid = did + ".s" + str(len(sentences))
                 this_sentence = Sentence(sentence_text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 # +1 accounts for the space joining sentences in doc_text.
                 doc_offset += len(sentence_text) + 1
                 doc_text += sentence_text + " "
                 sentences.append(this_sentence)
                 if i == nlines:
                     # Last line of the file: flush the final document, since
                     # no further "###" header will trigger it.
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 # start new sentence
                 sentence_text = ""
             else:
                 # Token line: the first tab-separated field is the word form.
                 t = l.strip().split("\t")
                 if sentence_text != "":
                     sentence_text += " "
                 sentence_text += t[0]
             pbar.update(i)
         pbar.finish()
Пример #2
0
 def load_corpus(self, corenlpserver, process=True):
     """Load a JNLPBA-format corpus from ``self.path`` into ``self.documents``.

     Lines starting with ``###`` open a new document, blank lines terminate
     the current sentence, and other lines are tab-separated tokens whose
     first field is the surface form.  Completed documents are processed with
     *corenlpserver* and stored in ``self.documents`` keyed by document id.
     """
     widgets = [pb.Percentage(), " ", pb.Bar(), " ", pb.ETA(), " ", pb.Timer()]
     # First pass: find the index of the last line, used to size the progress
     # bar and to know when to flush the final document.
     nlines = 0
     with open(self.path) as f:
         for nlines, l in enumerate(f):
             pass
     print(nlines)  # print() call form works on both Python 2 and 3
     pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
     with codecs.open(self.path, "r", "utf-8") as corpusfile:
         doc_text = ""
         sentences = []
         # Initialize parser state so a stray blank or token line before the
         # first "###" header cannot raise a NameError.
         did = None
         sentence_text = ""
         doc_offset = 0
         for i, l in enumerate(corpusfile):
             if l.startswith("###"):  # new doc
                 if doc_text != "":
                     # Flush the previous document before starting this one.
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 did = "JNLPBA" + l.strip().split(":")[-1]
                 logging.debug("starting new document:" + did)
                 sentence_text = ""
                 doc_offset = 0
                 sentences = []
             elif l.strip() == "" and sentence_text != "":  # new sentence
                 sid = did + ".s" + str(len(sentences))
                 this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                 # +1 accounts for the space joining sentences in doc_text.
                 doc_offset += len(sentence_text) + 1
                 doc_text += sentence_text + " "
                 sentences.append(this_sentence)
                 if i == nlines:
                     # Last line of the file: flush the final document, since
                     # no further "###" header will trigger it.
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 # start new sentence
                 sentence_text = ""
             else:
                 # Token line: the first tab-separated field is the word form.
                 t = l.strip().split("\t")
                 if sentence_text != "":
                     sentence_text += " "
                 sentence_text += t[0]
             pbar.update(i)
         pbar.finish()
Пример #3
0
 def load_corpus(self, corenlpserver, process=True):
     """Load an XML corpus (one file, many ``document`` elements).

     Each ``document`` element's ``sentence`` children (``id``/``text``
     attributes) are wrapped in :class:`Sentence` objects, the full document
     text is assembled, and the resulting :class:`Document` is processed with
     *corenlpserver* and stored in ``self.documents`` keyed by document id.
     """
     # self.path is just one file with every document
     time_per_abs = []
     with open(self.path, 'r') as xml:
         root = ET.fromstring(xml.read())
         all_docs = root.findall("document")
         widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
         pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
         for i, doc in enumerate(all_docs):
             # Reset the clock per document; previously it was started once
             # before the loop, so each recorded time was cumulative and the
             # reported per-abstract average was wrong.
             t = time.time()
             doctext = ""
             did = doc.get('id')
             doc_sentences = []  # get the sentences of this document
             doc_offset = 0  # offset of the current sentence relative to the document
             for sentence in doc.findall('sentence'):
                 sid = sentence.get('id')
                 text = sentence.get('text')
                 doctext += " " + text  # generate the full text of this document
                 this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(i+1)
         pbar.finish()
     if time_per_abs:  # avoid ZeroDivisionError on an empty corpus
         abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
         logging.info("average time per abstract: %ss" % abs_avg)
Пример #4
0
    def load_corpus(self, corenlpserver, process=True):
        """Load a GENIA-style corpus parsed with BeautifulSoup.

        Each ``article`` element yields one :class:`Document` whose id is
        derived from its ``bibliomisc`` text, whose text is the title followed
        by the abstract sentences, and whose sentences are wrapped as
        :class:`Sentence` objects.  Documents are processed with
        *corenlpserver* and stored in ``self.documents``.
        """
        soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"),
                             'html.parser')
        docs = soup.find_all("article")
        widgets = [
            pb.Percentage(), ' ',
            pb.Bar(), ' ',
            pb.ETA(), ' ',
            pb.Timer()
        ]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
        n_lines = 1
        time_per_abs = []
        for doc in docs:
            # Start the per-abstract timer here: previously it was reset
            # inside the sentence loop, so abs_time covered only the last
            # sentence and t was undefined for an abstract with no sentences.
            t = time.time()
            did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
            title = doc.title.sentence.get_text()
            sentences = doc.abstract.find_all("sentence")
            doc_sentences = []
            doc_text = title + " "
            doc_offset = 0
            for si, s in enumerate(sentences):
                stext = s.get_text()
                sid = did + ".s" + str(si)
                doc_text += stext + " "
                this_sentence = Sentence(stext,
                                         offset=doc_offset,
                                         sid=sid,
                                         did=did)
                doc_offset = len(doc_text)
                doc_sentences.append(this_sentence)
            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), abs_time))
            pbar.update(n_lines)
            n_lines += 1
        pbar.finish()
        if time_per_abs:  # avoid ZeroDivisionError on an empty corpus
            abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
            logging.info("average time per abstract: %ss" % abs_avg)
Пример #5
0
 def load_corpus(self, corenlpserver):
     """Parse every ``.xml`` file under ``self.path`` (DDI corpus layout).

     Each file holds one document: its ``sentence`` elements become
     :class:`Sentence` objects, the concatenated text becomes a
     :class:`Document`, which is processed with *corenlpserver* and stored
     in ``self.documents`` keyed by document id.  Per-file processing times
     are logged, plus the overall average.
     """
     # self.path is the base directory of the files of this corpus
     trainfiles = [self.path + '/' + name
                   for name in os.listdir(self.path) if name.endswith('.xml')]
     total = len(trainfiles)
     time_per_abs = []
     for current, fname in enumerate(trainfiles, start=1):
         logging.debug('%s:%s/%s', fname, current, total)
         with open(fname, 'r') as xml:
             # parse DDI corpus file
             started = time.time()
             root = ET.fromstring(xml.read())
             did = root.get('id')
             doctext = ""
             doc_sentences = []  # sentences of this document
             doc_offset = 0  # offset of the current sentence within the document
             for sentence in root.findall('sentence'):
                 sid = sentence.get('id')
                 text = sentence.get('text').replace('\r\n', '  ')
                 doctext += " " + text  # accumulate the full document text
                 # The sentence keeps the offset *before* this text was added.
                 doc_sentences.append(Sentence(text,
                                               offset=doc_offset,
                                               sid=sid,
                                               did=did))
                 doc_offset = len(doctext)
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             elapsed = time.time() - started
             time_per_abs.append(elapsed)
             logging.info("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), elapsed))
     abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Пример #6
0
 def load_corpus(self, corenlpserver, process=True):
     """Load an XML corpus (one file, many ``document`` elements).

     Wraps each document's ``sentence`` children in :class:`Sentence`
     objects, builds the full document text, processes the resulting
     :class:`Document` with *corenlpserver*, and stores it in
     ``self.documents`` keyed by document id.
     """
     # self.path is just one file with every document
     time_per_abs = []
     with open(self.path, 'r') as xml:
         root = ET.fromstring(xml.read())
         all_docs = root.findall("document")
         widgets = [
             pb.Percentage(), ' ',
             pb.Bar(), ' ',
             pb.AdaptiveETA(), ' ',
             pb.Timer()
         ]
         pbar = pb.ProgressBar(widgets=widgets,
                               maxval=len(all_docs)).start()
         for i, doc in enumerate(all_docs):
             # Reset the clock per document; previously it was started once
             # before the loop, so each recorded time was cumulative and the
             # reported per-abstract average was wrong.
             t = time.time()
             doctext = ""
             did = doc.get('id')
             doc_sentences = []  # get the sentences of this document
             doc_offset = 0  # offset of the current sentence relative to the document
             for sentence in doc.findall('sentence'):
                 sid = sentence.get('id')
                 text = sentence.get('text')
                 doctext += " " + text  # generate the full text of this document
                 this_sentence = Sentence(text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(i + 1)
         pbar.finish()
     if time_per_abs:  # avoid ZeroDivisionError on an empty corpus
         abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
         logging.info("average time per abstract: %ss" % abs_avg)
Пример #7
0
    def load_corpus(self, corenlpserver, process=True):
        """Load a GENIA-style corpus parsed with BeautifulSoup.

        Each ``article`` element becomes one :class:`Document` (title plus
        abstract sentences); sentences are wrapped as :class:`Sentence`
        objects.  Documents are processed with *corenlpserver* and stored
        in ``self.documents`` keyed by document id.
        """
        soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
        docs = soup.find_all("article")
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
        n_lines = 1
        time_per_abs = []
        for doc in docs:
            # Start the per-abstract timer here: previously it was reset
            # inside the sentence loop, so abs_time covered only the last
            # sentence and t was undefined for an abstract with no sentences.
            t = time.time()
            did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
            title = doc.title.sentence.get_text()
            sentences = doc.abstract.find_all("sentence")
            doc_sentences = []
            doc_text = title + " "
            doc_offset = 0
            for si, s in enumerate(sentences):
                stext = s.get_text()
                sid = did + ".s" + str(si)
                doc_text += stext + " "
                this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doc_text)
                doc_sentences.append(this_sentence)
            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
            pbar.update(n_lines)
            n_lines += 1
        pbar.finish()
        if time_per_abs:  # avoid ZeroDivisionError on an empty corpus
            abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
            logging.info("average time per abstract: %ss" % abs_avg)