Example #1
0
def produce_report(report_uuid, reportformat, download=False, PICO_vectors=True):
    """Build a report covering every article stored under ``report_uuid``.

    Args:
        report_uuid: key identifying the report's articles in the article table.
        reportformat: one of 'html', 'doc', or 'json'.
        download: when True, render the offline (downloadable) variant.
        PICO_vectors: include the 2D PICO-embedding plot (html/doc only).
            Automatically disabled when fewer than 2 parseable articles exist,
            since embeddings are only relatively meaningful.

    Returns:
        A rendered template string ('html'/'doc') or a JSON string ('json').

    Raises:
        Exception: if an unsupported ``reportformat`` is requested.
    """
    c = rr_sql_conn.cursor()
    articles, article_ids = [], []
    error_messages = []  # accumulate any errors over articles
    for row in c.execute("SELECT pdf_uuid, annotations FROM article WHERE report_uuid=?", (report_uuid,)):
        data = MultiDict()
        data.load_json(row[1])
        articles.append(data)
        article_ids.append(row[0])
    c.close()  # release the cursor as soon as all rows are consumed

    if reportformat in ('html', 'doc'):
        # embeddings only relatively meaningful; do not generate
        # if we have only 1 study.
        if sum(not article.get('_parse_error', False) for article in articles) < 2:
            # i.e. if we have fewer than 2 good articles then skip
            PICO_vectors = False

        pico_plot_html = u""
        if PICO_vectors:
            study_names, p_vectors, i_vectors, o_vectors = [], [], [], []
            p_words, i_words, o_words = [], [], []
            for article in articles:
                if article.get('_parse_error'):
                    # need to make errors record more systematically
                    error_messages.append("{0}<br/>".format(get_study_name(article)))
                else:
                    study_names.append(get_study_name(article))
                    p_vectors.append(np.array(article.ml["p_vector"]))
                    p_words.append(article.ml["p_words"])

                    i_vectors.append(np.array(article.ml["i_vector"]))
                    i_words.append(article.ml["i_words"])

                    o_vectors.append(np.array(article.ml["o_vector"]))
                    o_words.append(article.ml["o_words"])

            vectors_d = {"population": np.vstack(p_vectors),
                         "intervention": np.vstack(i_vectors),
                         "outcomes": np.vstack(o_vectors)}

            words_d = {"population": p_words, "intervention": i_words, "outcomes": o_words}

            pico_plot_html = bots["pico_viz_bot"].generate_2d_viz(study_names, vectors_d, words_d,
                                            "{0}-PICO-embeddings".format(report_uuid))

        return render_template('reportview.{}'.format(reportformat), headers=bots['bias_bot'].get_domains(), articles=articles,
                                pico_plot=pico_plot_html, report_uuid=report_uuid, online=(not download),
                                errors=error_messages, reportformat=reportformat)
    elif reportformat == 'json':
        return json.dumps({"article_ids": article_ids,
                           "article_data": [a.visible_data() for a in articles],
                           "report_id": report_uuid,
                           })
    else:
        raise Exception('format "{}" was requested but not available'.format(reportformat))
Example #2
0
 def test_annotate(self):
     """Test for PubmedRobot.annotate(data)."""
     with open(self.ex_file) as fh:
         fixture = json.load(fh)
     expected = fixture["annotate"]
     record = MultiDict()
     record.data["gold"]["title"] = fixture["title"]
     annotated = self.pr.annotate(record)
     self.assertEqual(annotated.data["pubmed"], expected)
Example #3
0
def get_marginalia(report_uuid, pdf_uuid):
    """Return JSON marginalia for a single article, looked up by pdf_uuid
    where the report_uuid also matches.

    The bot named by the ``annotation_type`` query parameter computes the
    marginalia from the stored annotations.

    Raises:
        KeyError: if no matching article exists (previously this crashed
            with an opaque TypeError when indexing a None row).
    """
    annotation_type = request.args["annotation_type"]
    c = rr_sql_conn.cursor()
    c.execute("SELECT annotations FROM article WHERE report_uuid=? AND pdf_uuid=?", (report_uuid, pdf_uuid))
    annotation_json = c.fetchone()
    c.close()  # release the cursor; the single row is already fetched
    if annotation_json is None:
        raise KeyError("no article found for report {0} / pdf {1}".format(report_uuid, pdf_uuid))
    data = MultiDict()
    data.load_json(annotation_json[0])
    marginalia = bots[annotation_type].get_marginalia(data)
    return json.dumps(marginalia)
Example #4
0
 def test_get_study_name(self):
     """get_study_name should return 'Surname et al.' for 2+ authors and a
     full 'Surname, Forename I.' form for a single author."""
     # Case 1: three authors -> abbreviated "et al." form.
     test_json = '{"gold": {                                                  \
         "authors": [                                                         \
             {"forename": "Carlos", "lastname": "Noronha", "initials": "C"},  \
             {"forename": "Neto C", "lastname": "", "initials": "NC"},        \
             {"forename": "Sabina S B", "lastname": "Maia", "initials": "SS"} \
         ]}}'
     mdict = MultiDict()
     mdict.load_json(test_json)
     study_name = "Noronha et al."
     self.assertEqual(get_study_name(mdict), study_name)
     # Case 2: two authors -> still the "et al." form.
     test_json = '{"gold": {                                                  \
         "authors": [                                                         \
             {"forename": "Carlos", "lastname": "Noronha", "initials": "C"},  \
             {"forename": "Neto C", "lastname": "", "initials": "NC"}         \
         ]}}'
     mdict.load_json(test_json)
     study_name = "Noronha et al."
     self.assertEqual(get_study_name(mdict), study_name)
     # Case 3: single author -> full name with trailing initial.
     test_json = '{"gold": {                                                \
         "authors": [                                                       \
             {"forename": "Carlos", "lastname": "Noronha", "initials": "C"} \
         ]}}'
     mdict.load_json(test_json)
     study_name = "Noronha, Carlos C."
     self.assertEqual(get_study_name(mdict), study_name)
Example #5
0
 def test_annotate(self):
     """Test for RCTRobot.annotate(data)."""
     # load the fixture into its own name (the original shadowed the
     # file handle with the parsed JSON)
     with open(ex_path + "rct.json") as fh:
         fixture = json.load(fh)
     md = MultiDict()
     md.data["gold"]["title"] = fixture["title"]
     md.data["gold"]["abstract"] = fixture["abstract"]
     md.data["pubmed"] = True
     annotated = self.rct.annotate(md)
     expected = {
         'is_rct': True,
         'model_class': 'svm_cnn_ptyp',
         'decision_score': 7.791535554772796
     }
     self.assertEqual(annotated.ml["rct"], expected)
    def convert(self, pdf_binary):
        """Run Grobid over a raw PDF and return a MultiDict of document info.

        On any parsing failure an empty MultiDict is returned with the
        grobid '_parse_error' flag set. A SHA-1 digest of the raw bytes is
        always recorded under gold['filehash'].
        """
        try:
            parsed = self.parse_xml(self.run_grobid(pdf_binary))
        except Exception as e:
            # return empty data if not possible to parse
            parsed = MultiDict()
            log.error(u"Grobid hasn't worked! :(\n exception raised: {}".format(e))
            parsed.grobid['_parse_error'] = True

        # one-shot constructor is equivalent to sha1() + update()
        parsed.gold['filehash'] = hashlib.sha1(pdf_binary).hexdigest()
        return parsed
Example #7
0
    def test_annotate(self):
        """Test for PICOVizRobot.annotate(data)."""
        with open(ex_path + "pico_viz.json") as fh:
            fixture = json.load(fh)
        md = MultiDict()
        md.data["gold"]["abstract"] = fixture["abstract"]
        md = self.pv.annotate(md)

        # every PICO element should yield both an embedding and top words;
        # the same six comparisons as before, just driven by a loop
        for key in ("p_vector", "p_words",
                    "i_vector", "i_words",
                    "o_vector", "o_words"):
            self.assertEqual(md.data["ml"][key], fixture[key])
Example #8
0
    def parse_xml(self, xml_string):
        """Parse Grobid TEI XML output into a MultiDict.

        Extracts abstract, title, full text, authors, journal, publication
        date, volume/issue, and page range into the 'grobid' namespace of
        the returned MultiDict.

        Args:
            xml_string: TEI XML document produced by Grobid.

        Returns:
            MultiDict with the extracted fields under ``.grobid``.
        """
        TEI = '{http://www.tei-c.org/ns/1.0}'  # hoisted namespace prefix
        output = MultiDict()
        full_text_bits = []
        author_list = []
        path = []  # stack of currently-open tags; disambiguates same-named elements
        for event, elem in ET.iterparse(StringIO(xml_string.encode('utf-8')), events=("start", "end")):
            if event == 'start':
                path.append(elem.tag)
            elif event == 'end':
                if elem.tag == TEI + 'abstract':
                    output.grobid['abstract'] = self._extract_text(elem)
                elif elem.tag == TEI + 'title' and TEI + 'titleStmt' in path:
                    output.grobid['title'] = self._extract_text(elem)
                elif elem.tag in [TEI + 'head', TEI + 'p']:
                    full_text_bits.extend([self._extract_text(elem), '\n'])
                elif elem.tag == TEI + 'persName' and TEI + 'fileDesc' in path:
                    forenames = [e.text for e in elem.findall(TEI + 'forename')]
                    lastnames = [e.text for e in elem.findall(TEI + 'surname')]
                    initials = [f[0] for f in forenames]
                    # NB the format below is identical to that used in pubmed_robot.py
                    author_list.append({"initials": u''.join(initials),
                                        "forename": u' '.join(forenames),
                                        "lastname": u' '.join(lastnames)})
                elif elem.tag == TEI + 'date' and elem.attrib.get('type') == 'published' and TEI + 'fileDesc' in path:
                    extracted_date = elem.attrib.get('when')
                    if extracted_date:
                        parsed_date = dateutil.parser.parse(extracted_date)
                        output.grobid["year"] = parsed_date.year
                        output.grobid["month"] = parsed_date.month
                elif elem.tag == TEI + 'biblScope' and TEI + 'fileDesc' in path:
                    unit = elem.attrib.get('unit')
                    if unit == 'volume':
                        output.grobid["volume"] = elem.text
                    elif unit == 'issue':
                        # BUG FIX: previously this branch wrote to "volume",
                        # clobbering the volume with the issue number
                        output.grobid["issue"] = elem.text
                    elif unit == 'page':
                        page_from = elem.attrib.get('from')
                        page_to = elem.attrib.get('to')
                        if page_from:
                            output.grobid["page_from"] = page_from
                        if page_to:
                            output.grobid["page_to"] = page_to
                        if page_from and page_to:
                            output.grobid["pages"] = "{}-{}".format(page_from, page_to)
                elif elem.tag == TEI + 'title' and TEI + 'fileDesc' in path:
                    output.grobid['journal'] = elem.text
                path.pop()

        output.grobid['text'] = u'\n'.join(full_text_bits)
        output.grobid['authors'] = author_list

        return output