Example #1
    def run_entity_annotator(self, doctag, annotator):
        """
        Classify a document using an annotator and insert results into the database
        :param doctag: tag of the document
        :param annotator: annotator to classify
        :return:
        """
        sentences = self.get_sentences(doctag)
        data = bottle.request.json
        output = {}
        for a in self.entity_annotators:  # a in (annotator_name, annotator_engine, annotator_etype)
            if a[0] == annotator:
                for s in sentences:
                    sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
                    #sentence.process_sentence(self.corenlp)
                    sentence.process_corenlp_output(ast.literal_eval(s[4]))
                    sentence_text = " ".join([t.text for t in sentence.tokens])
                    sentence_output = self.entity_annotators[a].annotate_sentence(sentence_text)
                    #print sentence_output

                    sentence_entities = self.entity_annotators[a].process_sentence(sentence_output, sentence)
                    for e in sentence_entities:
                        sentence_entities[e].normalize()
                        self.add_entity(sentence_entities[e], annotator)
                        output[e] = str(sentence_entities[e])
                        # print output
        return json.dumps(output)
Example #2
    def run_relation_annotator(self, doctag, annotator):
        """
        Classify a document using an annotator and insert results into the database
        :param doctag: tag of the document
        :param annotator: annotator to classify
        :return:
        """
        # process whole document instead of sentence by sentence
        sentences = self.get_sentences(doctag)
        data = bottle.request.json
        output = {}
        for a in self.relation_annotators:  # a in (annotator_name, annotator_engine, annotator_etype)
            if a[0] == annotator:
                input_sentences = []
                for s in sentences:
                    sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
                    sentence.process_corenlp_output(ast.literal_eval(s[4]))
                    sentence = self.get_entities(sentence)
                    input_sentences.append(sentence)
                sentence_results = self.relation_annotators[a].annotate_sentences(input_sentences)

                for sentence in input_sentences:
                    if a[1] == "jsre":
                        # sentence_results is keyed by sentence ID; the original s[1] always pointed at the last DB row
                        pred, original = sentence_results[sentence.sid]
                        sentence_relations = self.relation_annotators[a].process_sentence(pred, original, sentence)
                    elif a[1] == "smil":
                        sentence_relations = self.relation_annotators[a].process_sentence(sentence)
                    else:
                        continue  # unknown engine: skip instead of reusing a stale sentence_relations
                    for p in sentence_relations:
                        self.add_relation(p, annotator)
                        output[p.pid] = str(p)
        return json.dumps(output)
Example #3
 def get_document(self, doctag):
     # return document entry with doctag
     cur = self.db_conn.cursor()
     query = """SELECT distinct id, doctag, title, doctext
                    FROM document
                    WHERE doctag =%s;"""
     # print "QUERY", query
     cur.execute(query, (doctag, ))
     res = cur.fetchone()
     if res is not None:
         result = {
             'docID': res[1],
             'title': res[2],
             'docText': res[3],
             'abstract': {
                 'sentences': []
             }
         }
         sentences = self.get_sentences(doctag)
         for s in sentences:
             sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
             sentence.process_corenlp_output(ast.literal_eval(s[4]))
             sentence = self.get_entities(sentence)
             result['abstract']['sentences'].append(sentence.get_dic("all"))
         output = json.dumps(result)
         return output
     else:
         return json.dumps(
             {'error': 'could not find document {}'.format(doctag)})
Example #4
 def sentence_tokenize(self, doctype):
     """
     Split the document text into sentences, add to self.sentences list
     :param doctype: Can be used in the future to choose different methods
     """
     # first sentence should be the title if it exists
     if self.title:
         sid = self.did + ".s0"
         self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
     # inputtext = clean_whitespace(self.text)
     inputtext = self.text
     with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
         geniainput.write(inputtext)
     current_dir = os.getcwd()
     os.chdir(geniass_path)
     geniaargs = [
         "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"
     ]
     Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
     os.chdir(current_dir)
     offset = 0
     with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
         for l in geniaoutput:
             stext = l.strip()
             if stext == "":
                 offset = self.get_space_between_sentences(offset)
                 continue
             sid = self.did + ".s" + str(len(self.sentences))
             self.sentences.append(
                 Sentence(stext, offset=offset, sid=sid, did=self.did))
             offset += len(stext)
             offset = self.get_space_between_sentences(offset)
Example #5
 def sentence_tokenize(self, doctype):
     """
     Split the document text into sentences, add to self.sentences list
     :param doctype: Can be used in the future to choose different methods
     """
     # first sentence should be the title if it exists
     if self.title:
         sid = self.did + ".s0"
         self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
     # inputtext = clean_whitespace(self.text)
     inputtext = self.text
     if not os.path.exists(geniass_path + '/tmp/'):
         os.mkdir(geniass_path + '/tmp/')
     with codecs.open(geniass_path + "/tmp/geniainput.txt", 'w',
                      'utf-8') as geniainput:
         geniainput.write(inputtext)
     current_dir = os.getcwd()
     os.chdir(geniass_path)
     geniacmd = "geniass.exe tmp/geniainput.txt tmp/geniaoutput.txt"
     call(geniacmd, shell=True)
     os.chdir(current_dir)
     offset = 0
     with codecs.open(geniass_path + "/tmp/geniaoutput.txt", 'r',
                      "utf-8") as geniaoutput:
         for l in geniaoutput:
             stext = l.strip()
             if stext == "":
                 offset = self.get_space_between_sentences(offset)
                 continue
             sid = self.did + ".s" + str(len(self.sentences))
             self.sentences.append(
                 Sentence(stext, offset=offset, sid=sid, did=self.did))
             offset += len(stext)
             offset = self.get_space_between_sentences(offset)
Example #6
 def generate_data(self, corpus, modelname, pairtypes):
     # TODO: remove old model
     pcount = 0
     truepcount = 0
     ns = 0
     for did in corpus.documents:
         doc_entities = corpus.documents[did].get_entities("goldstandard")
         examplelines = []
         # logging.info("{}".format(sentence.sid))
         # sentence_entities = sentence.entities.elist["goldstandard"]
         # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
         for pair in itertools.permutations(doc_entities, 2):
             sn1 = int(pair[0].sid.split(".")[-1][1:])
             sn2 = int(pair[1].sid.split(".")[-1][1:])
             # if self.pairtype in corpus.type_sentences and pair[0].sid not in corpus.type_sentences[self.pairtype]:
             #     continue
              if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                  continue
             # if self.pairtype in ("Has_Sequence_Identical_To", "Is_Functionally_Equivalent_To") and pair[0].type != pair[1].type:
             #     continue
             #if pair[0].text == pair[1].text:
             #    continue
             # logging.info("{}=>{}|{}=>{}".format(pair[0].type, pair[1].type, pairtypes[0], pairtypes[1]))
              if pair[0].type in config.pair_types[self.pairtype]["source_types"] and pair[1].type in config.pair_types[self.pairtype]["target_types"]:
                 #if pair[0].type in config.event_types[self.pairtype]["source_types"] and pair[1].type in config.event_types[self.pairtype]["target_types"]:
                 #pair[1].type in config.pair_types[self.pairtype]["source_types"] and pair[0].type in config.pair_types[self.pairtype]["target_types"]:
                 # logging.debug(pair)
                 #if pair[0].type not in config.pair_types[self.pairtype]["source_types"]:
                 #    pair = (pair[1], pair[0])
                 pid = did + ".p" + str(pcount)
                 # self.pairs[pid] = (e1id, e2id)
                 if sn1 != sn2:
                     sentence1 = corpus.documents[did].get_sentence(
                         pair[0].sid)
                     sentence2 = corpus.documents[did].get_sentence(
                         pair[1].sid)
                     sentence = Sentence(text=sentence1.text + " " +
                                         sentence2.text,
                                         offset=sentence1.offset)
                     sentence.tokens = sentence1.tokens + sentence2.tokens
                     for t in pair[1].tokens:
                         t.order += len(sentence1.tokens)
                 else:
                     sentence = corpus.documents[did].get_sentence(
                         pair[0].sid)
                 f, label = self.generate_features(sentence, pair)
                 self.features.append(f)
                 self.labels.append(label)
                 self.pairs.append(pair)
Example #7
 def load_corpus(self, corenlpserver, process=True):
     total_lines = sum(1 for line in open(self.path))
     widgets = [
         pb.Percentage(), ' ',
         pb.Bar(), ' ',
         pb.ETA(), ' ',
         pb.Timer()
     ]
     pbar = pb.ProgressBar(widgets=widgets,
                           maxval=total_lines,
                           redirect_stdout=True).start()
     time_per_abs = []
     with codecs.open(self.path, 'r', "utf-8") as trainfile:
         current = 0
         for line in trainfile:
             #logging.debug('%s:%s/%s', f, current + 1, total)
             x = line.strip().split(" ")
             did = x[0]
             doctext = " ".join(x[1:])
             newdoc = Document(doctext, process=False, did=did)
             #newdoc.sentence_tokenize("biomedical")
             sid = did + ".s0"
             newdoc.sentences.append(
                 Sentence(doctext, offset=0, sid=sid, did=did))
             if process:
                 newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             # abs_time = time.time() - t
             # time_per_abs.append(abs_time)
             #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
             pbar.update(current + 1)
             current += 1
     pbar.finish()
Example #8
 def load_corpus(self, corenlpserver, process=True):
     widgets = [
         pb.Percentage(), ' ',
         pb.Bar(), ' ',
         pb.ETA(), ' ',
         pb.Timer()
     ]
     nlines = 0
     with open(self.path) as f:
         for nlines, l in enumerate(f):
             pass
      print(nlines)
     pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
     with codecs.open(self.path, 'r', "utf-8") as corpusfile:
         doc_text = ""
         sentences = []
         for i, l in enumerate(corpusfile):
             if l.startswith("###"):  # new doc
                 if doc_text != "":
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     # logging.info(len(newdoc.sentences))
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 did = "JNLPBA" + l.strip().split(":")[-1]
                 logging.debug("starting new document:" + did)
                 sentence_text = ""
                 doc_offset = 0
                 sentences = []
             elif l.strip() == "" and sentence_text != "":  # new sentence
                 #logging.debug("creating mew sentence: {}".format(sentence_text))
                 sid = did + ".s" + str(len(sentences))
                 this_sentence = Sentence(sentence_text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset += len(sentence_text) + 1
                 doc_text += sentence_text + " "
                 sentences.append(this_sentence)
                 if i == nlines:
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     # logging.info(len(newdoc.sentences))
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 # start new sentence
                 sentence_text = ""
             else:
                 #logging.debug(str(i) + "/" + str(l))
                 t = l.strip().split("\t")
                 if sentence_text != "":
                     sentence_text += " "
                 #if t[1] == "B-protein"
                 sentence_text += t[0]
             pbar.update(i)
         pbar.finish()
Example #9
 def generate_data(self, corpus, modelname, pairtypes):
    # TODO: remove old model
     pcount = 0
     truepcount = 0
     ns = 0
     for did in corpus.documents:
         doc_entities = corpus.documents[did].get_entities("goldstandard")
         examplelines = []
         # logging.info("{}".format(sentence.sid))
         # sentence_entities = sentence.entities.elist["goldstandard"]
         # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
         for pair in itertools.permutations(doc_entities, 2):
             sn1 = int(pair[0].sid.split(".")[-1][1:])
             sn2 = int(pair[1].sid.split(".")[-1][1:])
             # if self.pairtype in corpus.type_sentences and pair[0].sid not in corpus.type_sentences[self.pairtype]:
             #     continue
             if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                 continue
             # if self.pairtype in ("Has_Sequence_Identical_To", "Is_Functionally_Equivalent_To") and pair[0].type != pair[1].type:
             #     continue
             #if pair[0].text == pair[1].text:
             #    continue
             # logging.info("{}=>{}|{}=>{}".format(pair[0].type, pair[1].type, pairtypes[0], pairtypes[1]))
             if pair[0].type in config.seedev_types.pair_types[self.pairtype]["source_types"] and pair[1].type in config.seedev_types.pair_types[self.pairtype]["target_types"]:
             #if pair[0].type in config.event_types[self.pairtype]["source_types"] and pair[1].type in config.event_types[self.pairtype]["target_types"]:
                                     #pair[1].type in config.pair_types[self.pairtype]["source_types"] and pair[0].type in config.pair_types[self.pairtype]["target_types"]:
                 # logging.debug(pair)
                 #if pair[0].type not in config.pair_types[self.pairtype]["source_types"]:
                 #    pair = (pair[1], pair[0])
                 pid = did + ".p" + str(pcount)
                 # self.pairs[pid] = (e1id, e2id)
                 if sn1 != sn2:
                     sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                     sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                     sentence = Sentence(text = sentence1.text + " " + sentence2.text, offset=sentence1.offset)
                     sentence.tokens = sentence1.tokens + sentence2.tokens
                     for t in pair[1].tokens:
                         t.order += len(sentence1.tokens)
                 else:
                     sentence = corpus.documents[did].get_sentence(pair[0].sid)
                 f, label = self.generate_features(sentence, pair)
                 self.features.append(f)
                 self.labels.append(label)
                 self.pairs.append(pair)
Example #10
 def get_document(self, doctag):
     # return document entry with doctag
     cur = self.db_conn.cursor()
     query = """SELECT distinct id, doctag, title, doctext
                    FROM document
                    WHERE doctag =%s;"""
     # print "QUERY", query
     cur.execute(query, (doctag,))
     res = cur.fetchone()
     if res is not None:
         result = {'docID': res[1], 'title': res[2], 'docText': res[3], 'abstract':{'sentences':[]}}
         sentences = self.get_sentences(doctag)
         for s in sentences:
             sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
             sentence.process_corenlp_output(ast.literal_eval(s[4]))
             sentence = self.get_entities(sentence)
             result['abstract']['sentences'].append(sentence.get_dic("all"))
         output = json.dumps(result)
         return output
     else:
         return json.dumps({'error': 'could not find document {}'.format(doctag)})
Example #11
def pos_tag(text):
    """
    Tokenize a given text and generate a list of Sentence objects.

    :param text: the text to split into sentences, tokenize and POS-tag.
    :return: A list of Sentence objects representing the sentences in the text, with the appropriate POS tags added.
    """
    sentences = []
    for count, sentence in enumerate(nltk.sent_tokenize(text)):
        tokens = OrderedDict()
        # get the tokens and POS tags
        for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
            # keyed by word text, so a repeated word in a sentence keeps only its last tag
            tokens[word] = tag
            # sentence is now tokenized and tokens have POS tags
        sentences.append(Sentence(count, tokens))
    return sentences
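
A minimal usage sketch for the pos_tag helper above. The sample text and variable name are hypothetical; the nltk.download calls fetch the "punkt" and "averaged_perceptron_tagger" models that sent_tokenize, word_tokenize and pos_tag rely on.

import nltk

nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)

sample_text = "Aspirin inhibits COX-1. It also binds serum albumin."
for s in pos_tag(sample_text):
    # each Sentence wraps an OrderedDict mapping token text to its POS tag
    print(s)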
Example #12
    def load_corpus(self, corenlpserver, process=True):

        soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"),
                             'html.parser')
        docs = soup.find_all("article")
        widgets = [
            pb.Percentage(), ' ',
            pb.Bar(), ' ',
            pb.ETA(), ' ',
            pb.Timer()
        ]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
        n_lines = 1
        time_per_abs = []
        for doc in docs:
            t = time.time()  # start timing this abstract
            did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
            title = doc.title.sentence.get_text()
            sentences = doc.abstract.find_all("sentence")
            doc_sentences = []
            doc_text = title + " "
            doc_offset = 0
            for si, s in enumerate(sentences):
                stext = s.get_text()
                sid = did + ".s" + str(si)
                doc_text += stext + " "
                this_sentence = Sentence(stext,
                                         offset=doc_offset,
                                         sid=sid,
                                         did=did)
                doc_offset = len(doc_text)
                doc_sentences.append(this_sentence)
            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            #logging.info(len(newdoc.sentences))
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), abs_time))
            pbar.update(n_lines)
            n_lines += 1
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #13
 def load_corpus(self, corenlpserver):
     # self.path is the base directory of the files of this corpus
     trainfiles = [
         self.path + '/' + f for f in os.listdir(self.path)
         if f.endswith('.xml')
     ]
     total = len(trainfiles)
     current = 0
     time_per_abs = []
     for f in trainfiles:
         logging.debug('%s:%s/%s', f, current + 1, total)
         current += 1
         with open(f, 'r') as xml:
             #parse DDI corpus file
             t = time.time()
             root = ET.fromstring(xml.read())
             doctext = ""
             did = root.get('id')
             doc_sentences = []  # get the sentences of this document
             doc_offset = 0  # offset of the current sentence relative to the document
             for sentence in root.findall('sentence'):
                 sid = sentence.get('id')
                 #logging.info(sid)
                 text = sentence.get('text')
                 text = text.replace('\r\n', '  ')
                 doctext += " " + text  # generate the full text of this document
                 this_sentence = Sentence(text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             #logging.info(len(doc_sentences))
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             #logging.info(len(newdoc.sentences))
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             logging.info("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), abs_time))
     abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #14
 def load_corpus(self, corenlpserver, process=True):
     # self.path is just one file with every document
     time_per_abs = []
     with open(self.path, 'r') as xml:
         root = ET.fromstring(xml.read())
         all_docs = root.findall("document")
         widgets = [
             pb.Percentage(), ' ',
             pb.Bar(), ' ',
             pb.AdaptiveETA(), ' ',
             pb.Timer()
         ]
         pbar = pb.ProgressBar(widgets=widgets,
                               maxval=len(all_docs)).start()
         for i, doc in enumerate(all_docs):
             t = time.time()  # start timing this document
             doctext = ""
             did = doc.get('id')
             doc_sentences = []  # get the sentences of this document
             doc_offset = 0  # offset of the current sentence relative to the document
             for sentence in doc.findall('sentence'):
                 sid = sentence.get('id')
                 #logging.info(sid)
                 text = sentence.get('text')
                 #text = text.replace('\r\n', '  ')
                 doctext += " " + text  # generate the full text of this document
                 this_sentence = Sentence(text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(i + 1)
         pbar.finish()
     abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #15
 def load_corpus(self, corenlpserver, process=True):
     total_lines = sum(1 for line in open(self.path))
     time_per_abs = []
     with codecs.open(self.path, 'r', "utf-8") as trainfile:
         current = 0
         ddi = ""
         for line in trainfile:
             #logging.debug('%s:%s/%s', f, current + 1, total)
             if line.startswith("ID"):
                 did = line.strip().split("\t")[1]
                  print(did)
             elif line.startswith("sentence"):
                 doctext = line.strip().split("\t")[1]
                 newdoc = Document(doctext, process=False, did=did)
                 sid = did + ".s0"
                 newdoc.sentences.append(
                     Sentence(doctext, offset=0, sid=sid, did=did))
                 if process:
                     newdoc.process_document(corenlpserver)
                 self.documents[newdoc.did] = newdoc