Example #1
    def parse_article(self, html, pmid=None, metadata_dir=None):
        ''' Takes an HTML article as input and returns an Article. The PMID can
        also be passed, which avoids having to scrape it from the article and/or
        look it up in PubMed. '''

        # Skip rest of processing if this record already exists
        if pmid is not None and self.database.article_exists(
                pmid) and not config.OVERWRITE_EXISTING_ROWS:
            return False

        html = html.decode('utf-8')  # Make sure we're working with unicode
        html = self.decode_html_entities(html)
        soup = BeautifulSoup(html)
        doi = self.extract_doi(soup)
        pmid = self.extract_pmid(soup) if pmid is None else pmid
        metadata = scrape.get_pubmed_metadata(pmid,
                                              store=metadata_dir,
                                              save=True)

        # TODO: add Source-specific delimiting of salient text boundaries--e.g., exclude References
        text = soup.get_text()
        if self.database.article_exists(pmid):
            if config.OVERWRITE_EXISTING_ROWS:
                self.database.delete_article(pmid)
            else:
                return False
        self.article = database.Article(text,
                                        pmid=pmid,
                                        doi=doi,
                                        metadata=metadata)
        return soup
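
A minimal usage sketch follows (not taken from the project above): it assumes a source object that exposes the parse_article() method shown in the example, plus a locally saved HTML file; the helper name, file path, and PMID are illustrative only.

def parse_local_article(source, path, pmid=None, metadata_dir=None):
    ''' Hypothetical helper: read a saved HTML article and hand the raw bytes
    to an object exposing the parse_article() method shown above. '''
    with open(path, 'rb') as f:
        raw_html = f.read()  # bytes, since parse_article() decodes them as UTF-8
    soup = source.parse_article(raw_html, pmid=pmid, metadata_dir=metadata_dir)
    if soup is False:
        # parse_article() returns False when the PMID is already stored and
        # overwriting is disabled in the config.
        print('Skipped PMID %s: already in the database.' % pmid)
    return soup

On success, parse_article() returns the BeautifulSoup object, so the caller can run further extraction against the same parse tree.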
Example #2
    def update_metadata_from_pubmed(self, pmid):
        ''' Populate this article's metadata fields from its PubMed record. '''
        pmd = scrape.get_pubmed_metadata(pmid)
        self.id = int(pmid)
        self.title = pmd['title']
        self.journal = pmd['journal']
        self.pubmed_metadata = pmd
        self.year = pmd['year']
        self.abstract = pmd['abstract']
        self.citation = pmd['citation']
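
As a hedged sketch of how the method above might be used (the helper name and the article argument are assumptions, not part of the original codebase), the function below refreshes an article's PubMed-derived fields and builds a one-line summary from the attributes the method sets.

def refresh_and_summarize(article, pmid):
    ''' Hypothetical helper: refresh an article's PubMed metadata via the
    update_metadata_from_pubmed() method shown above, then summarize it using
    only the attributes that method sets (title, year, journal). '''
    article.update_metadata_from_pubmed(pmid)
    return '%s (%s). %s.' % (article.title, article.year, article.journal)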