Exemplo n.º 1
0
    def lookUpItem(self, site, keyword):
        """Search one site for the given keyword and output every result.

        Dispatches to the appropriate search backend based on the site's
        domain: PubMed's API for nih.gov, arXiv's API for arxiv.org, and
        generic HTML scraping for everything else. Each article found is
        passed to outputResult.
        """
        siteName = helpers.getDomainName(site.get('url', ''))

        self.totalResults = 0

        articles = []

        # use pubmed's api
        if siteName == 'nih.gov':
            articles = self.nihSearch(site, keyword)
        # use arxiv's api
        elif siteName == 'arxiv.org':
            articles = self.arxivSearch(site, keyword)
        # get the website and parse it
        else:
            siteData = {}

            keywordWithPlusSigns = urllib.parse.quote_plus(keyword)
            keywordWithPlusSigns = keywordWithPlusSigns.replace('%20', '+')

            # biorxiv and medrxiv share the same Highwire page layout, so the
            # scraping configuration differs only in the domain name
            if siteName in ('biorxiv.org', 'medrxiv.org'):
                domain = f'https://www.{siteName}'
                siteData = {
                    'url':
                    f'{domain}/search/{keywordWithPlusSigns}%20numresults%3A75%20sort%3Arelevance-rank',
                    'resultsXpath':
                    "//a[@class = 'highwire-cite-linked-title']",
                    'totalResultsXpath': "//*[@id = 'search-summary-wrapper']",
                    'titleXpath': "./span[@class = 'highwire-cite-title']",
                    'dateSubmittedXpath':
                    "//div[@class = 'pane-content' and contains(., 'Posted')]",
                    'urlPrefix': domain,
                    'afterFirstPageSuffix': '?page={}',
                    'abstractXpath': "//*[@id = 'abstract-1']//*[@id = 'p-2']",
                    'titleInDetailsPageXpath': "//*[@id = 'page-title']"
                }

            articles = self.genericSearch(site, keyword, siteData)

        # download all the pdf url's we found; result numbers are 1-based
        for resultNumber, article in enumerate(articles, start=1):
            self.outputResult(site, keyword, resultNumber, article)
Exemplo n.º 2
0
    def outputResult(self, site, keyword, resultNumber, article):
        """Record one search result to the CSV log files, then wait.

        :param site: site dictionary (passed through for logging context)
        :param keyword: the search keyword that produced this result
        :param resultNumber: 1-based index of the result for this keyword
        :param article: result row list for the article
        """
        # this variant does not download the pdf, so the status is fixed and
        # there is no output file
        downloaded = 'Not downloaded'
        outputFileName = ''

        # log to the csv file anyway
        self.logToCsvFiles(site, keyword, resultNumber, article,
                           outputFileName, downloaded, False, True)

        # throttle between results to avoid hammering the site
        self.waitBetween()
Exemplo n.º 3
0
    def markDone(self, site, keyword):
        """Record in the history table that this site/keyword pair is done."""
        siteName = helpers.getDomainName(site.get('url', ''))

        # escape single quotes for the sql filter string used later in isDone
        keyword = keyword.replace("'", "''")

        item = {
            'siteName': siteName,
            'keyword': keyword,
            'directory': self.options['outputDirectory'],
            'gmDate': str(datetime.datetime.utcnow())
        }

        # plain string: no placeholders, so no f-string needed
        logging.debug('Inserting into database')
        logging.debug(item)

        self.database.insert('history', item)
Exemplo n.º 4
0
    def isDone(self, site, keyword):
        """Return True if this site/keyword pair was already processed.

        Looks for a matching row in the history table for the site's domain,
        the keyword, and the current output directory.
        """
        result = False

        siteName = helpers.getDomainName(site.get('url', ''))

        # escape single quotes; mirrors the escaping done in markDone
        keyword = keyword.replace("'", "''")

        directory = self.options['outputDirectory']

        # NOTE(review): values are interpolated directly into the sql filter
        # and only keyword is escaped; prefer parameterized queries if the
        # database api supports them.
        existingRow = self.database.getFirst(
            'history', 'siteName',
            f"siteName= '{siteName}' and keyword = '{keyword}' and directory = '{directory}'",
            '', '')

        if existingRow:
            logging.info('Skipping. Already done this item.')
            result = True

        return result
Exemplo n.º 5
0
    def arxivSearch(self, site, keyword):
        """Search arxiv.org for the keyword using the arxiv api.

        Returns a list of result rows of the form:
        [id, pdfUrl, title, dateSubmitted, abstract, allAuthors,
         allLocations, firstAuthor, firstAuthorLocation, lastAuthor,
         lastAuthorLocation, citations]
        Location and citation fields are always empty strings here since the
        arxiv api does not provide them.
        """
        results = []

        maximumResults = self.options['maximumResultsPerKeyword']

        # -1 means unlimited; the arxiv library expects None for that
        if maximumResults == -1:
            maximumResults = None

        items = arxiv.query(query=keyword,
                            id_list=[],
                            max_results=maximumResults,
                            start=0,
                            sort_by="relevance",
                            sort_order="descending",
                            prune=True,
                            iterative=False,
                            max_chunk_results=1000)

        # a set gives O(1) duplicate checks; avoid shadowing the id() builtin
        seenIds = set()

        for item in items:
            articleId = item.get('id', '')
            articleId = self.getLastAfterSplit(articleId, '/')

            # avoids duplicates
            if articleId in seenIds:
                continue

            seenIds.add(articleId)

            pdfUrl = item.get('pdf_url', '')

            if not pdfUrl:
                siteName = helpers.getDomainName(site.get('url', ''))
                message = f'No pdf file found on {siteName} for {articleId}'
                logging.error(message)
                pdfUrl = f'Error: {message}'

            title = item.get('title', '')
            title = title.replace('\n', ' ')
            title = self.squeezeWhitespace(title)

            dateSubmitted = item.get('published', '')

            # keep only the date part of the iso-style timestamp (before 'T')
            dateSubmitted = helpers.findBetween(dateSubmitted, '', 'T')

            # truncated title used only for log readability
            shortTitle = title

            if len(shortTitle) > 50:
                shortTitle = shortTitle[0:50] + '...'

            abstract = item.get('summary', '')

            allAuthors = '; '.join(item.get('authors', ''))
            allLocations = ''
            firstAuthor = self.getFirst(item.get('authors', ''))
            firstAuthorLocation = ''
            lastAuthor = self.getLast(item.get('authors', ''))
            lastAuthorLocation = ''
            citations = ''

            result = [
                articleId, pdfUrl, title, dateSubmitted, abstract, allAuthors,
                allLocations, firstAuthor, firstAuthorLocation, lastAuthor,
                lastAuthorLocation, citations
            ]

            results.append(result)

            logging.info(
                f'Results: {len(results)}. Id: {articleId}. Title: {shortTitle}.')

        self.totalResults = len(results)

        self.showResultCount()

        # log the search now because the download might fail
        self.logToCsvFiles(site, keyword, -1, [], '', False, True, False)

        return results
Exemplo n.º 6
0
    def showStatus(self, item, keyword):
        """Log which site and keyword are currently being processed."""
        domain = helpers.getDomainName(item.get('url', ''))

        sitePart = f'Site {self.onItemIndex + 1} of {len(self.sites)}: {domain}.'
        keywordPart = f'Keyword {self.onKeywordIndex + 1} of {len(self.keywords)}: {keyword}.'

        logging.info(f'{sitePart} {keywordPart}')