Example #1
    def handle(self, *args, **options):
        """Imports a set of DOIs into .

        :param args: None
        :param options: Dictionary containing 'input_file' (a CSV with DOI and URL columns) and 'output_file' (destination for DOI/PDF-URL pairs)
        :return: None
        """
        with open(options['input_file'], 'r') as in_file, \
                open(options['output_file'], 'w') as out_file:
            reader = csv.DictReader(in_file)
            out = csv.writer(out_file)

            # header row; the second column carries the iParadigms crawler tag
            out.writerow(['DOI', '<item crawler="iParadigms">'])

            for row in reader:
                # skip links hosted on S3 or on the publisher's own domain
                if 'amazonaws' not in row['URL'] and 'uwp.co.uk' not in row['URL']:
                    page = shared.fetch_page(row['URL'])
                    pdf_url = shared.get_pdf_url(page)
                    out.writerow([row['DOI'], pdf_url])
                    out_file.flush()
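The options dictionary here is filled in by Django's management-command argument parser; the snippet also assumes csv and a shared helper module are in scope. A minimal sketch of the surrounding command, assuming a standard BaseCommand subclass; the class body and argument names below are illustrative rather than taken from the source:

import csv

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Resolve a CSV of DOIs to PDF URLs and write them out.'

    def add_arguments(self, parser):
        # these positional arguments become options['input_file'] and
        # options['output_file'] in the handle() method shown above
        parser.add_argument('input_file')
        parser.add_argument('output_file')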
Example #2
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to find a Creative Commons license link on the page
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'),
        journal=journal) if license_tag else models.Licence.objects.none()

    if license_object.exists():
        license_object = license_object.first()
        logger.info("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(name='All rights reserved',
                                                    journal=journal)
        logger.warning(
            "Did not find a license for this article. Using: {0}".format(
                license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a',
                                     text='Peer Reviewed') is not None
    logger.debug("Peer reviewed: {0}".format(peer_reviewed))

    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        logger.info("Ripping XML")
        # guard against an anchor tag with no href attribute
        xml = (xml.get('href') or '').strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add as an HTML galley
        logger.info("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}

    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        logger.info("Attempting to assign thumbnail.")

        # assumes the URL's final path segment contains the numeric article ID
        final_path_element = url.split('/')[-1]
        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(final_path_element)
        article_id = matches.group(1)

        logger.info("Determined remote article ID as: {0}".format(article_id))
        logger.info("Thumbnail path: {thumb_path}, URL: {url}".format(
            thumb_path=thumb_path, url=url))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime,
                            'graphic',
                            'Thumbnail',
                            user,
                            filename,
                            new_article,
                            thumbnail=True)
        except Exception as e:
            logger.warning("Unable to import thumbnail: %s" % e)

    # look up the view/download stats shown on the page
    stats = soup_object.find_all('div', {'class': 'stat-number'})

    # save the article to the database
    new_article.save()

    try:
        if stats:
            from metrics import models as metrics_models
            views = stats[0].contents[0]
            if len(stats) > 1:
                downloads = stats[1].contents[0]
            else:
                downloads = 0

            metrics_models.HistoricArticleAccess.objects.create(
                article=new_article, views=views, downloads=downloads)
    except (IndexError, AttributeError):
        logger.info("No article metrics found")
Example #3
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to find a Creative Commons license link on the page
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'),
        journal=journal) if license_tag else models.Licence.objects.none()

    if license_object.exists():
        license_object = license_object.first()
        print("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(name='All rights reserved',
                                                    journal=journal)
        print("Did not find a license for this article. Using: {0}".format(
            license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a',
                                     text='Peer Reviewed') is not None
    print("Peer reviewed: {0}".format(peer_reviewed))

    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        print("Ripping XML")
        # guard against an anchor tag with no href attribute
        xml = (xml.get('href') or '').strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add as an HTML galley
        print("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}

    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        print("Attempting to assign thumbnail.")

        # note: the lazy prefix means match() captures the first digit run
        # anywhere in the full URL (see the note after this example)
        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(url)

        article_id = matches.group(1)

        print("Determined remote article ID as: {0}".format(article_id))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime,
                            'graphic',
                            'Thumbnail',
                            user,
                            filename,
                            new_article,
                            thumbnail=True)
        except Exception as e:
            print("Unable to import thumbnail. Recoverable error: {0}".format(e))

    # save the article to the database
    new_article.save()
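The two versions differ in what the ID regex is matched against: Example #2 matches the last path segment, while this version matches the whole URL. Because the '.*?' prefix is lazy, match() captures the first run of digits it can reach, so the two variants can disagree whenever digits appear earlier in the URL. A quick illustration with a made-up URL:

import re

id_regex = re.compile(r'.*?(\d+)')

url = 'https://www.up9.example.org/articles/abstract/123'
final_path_element = url.split('/')[-1]

print(id_regex.match(url).group(1))                 # '9'   (Example #3 behaviour)
print(id_regex.match(final_path_element).group(1))  # '123' (Example #2 behaviour)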