Example #1
File: up.py  Project: rhb123/janeway
def get_ojs_file(base_url, url, article, auth_file, label):
    filename, mime = shared.fetch_file(base_url, url, None, None, article, None, handle_images=False,
                                       auth_file=auth_file)
    extension = os.path.splitext(filename)[1]
    file = shared.add_file(mime, extension, label, article.owner, filename, article, galley=False)

    return file
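
A minimal usage sketch for get_ojs_file, assuming an article object, an OJS base URL and an auth file path are already available (all values below are illustrative, not taken from the snippet above):

# Hypothetical values for illustration only
base_url = 'https://journal.example.org'
file_url = base_url + '/article/download/12/34'
ms_file = get_ojs_file(base_url, file_url, article, '/tmp/ojs_auth.txt', 'MS File')
article.manuscript_files.add(ms_file)
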
Example #2
def create_article_with_review_content(article_dict, journal, auth_file,
                                       base_url):
    date_started = timezone.make_aware(
        dateparser.parse(article_dict.get('date_submitted')))

    # Create a base article
    article = models.Article(
        journal=journal,
        title=article_dict.get('title'),
        abstract=article_dict.get('abstract'),
        language=article_dict.get('language'),
        stage=models.STAGE_UNDER_REVIEW,
        is_import=True,
        date_submitted=date_started,
    )

    article.save()

    # Check for editors and assign them as section editors.
    editors = article_dict.get('editors', [])

    for editor in editors:
        try:
            account = core_models.Account.objects.get(email=editor)
            account.add_account_role('section-editor', journal)
            review_models.EditorAssignment.objects.create(
                article=article, editor=account, editor_type='section-editor')
            logger.info('Editor added to article')
        except Exception as e:
            logger.error('Editor account was not found.')
            logger.exception(e)

    # Add a new review round
    round = review_models.ReviewRound.objects.create(article=article,
                                                     round_number=1)

    # Add keywords
    keywords = article_dict.get('keywords')
    if keywords:
        for keyword in keywords.split(';'):
            word, created = models.Keyword.objects.get_or_create(word=keyword)
            article.keywords.add(word)

    # Add authors
    for author in article_dict.get('authors'):
        try:
            author_record = core_models.Account.objects.get(
                email=author.get('email'))
        except core_models.Account.DoesNotExist:
            author_record = core_models.Account.objects.create(
                email=author.get('email'),
                first_name=author.get('first_name'),
                last_name=author.get('last_name'),
                institution=author.get('affiliation'),
                biography=author.get('bio'),
            )

        # If we have a country, fetch its record
        if author.get('country'):
            try:
                country = core_models.Country.objects.get(
                    code=author.get('country'))
                author_record.country = country
                author_record.save()
            except core_models.Country.DoesNotExist:
                pass
        # Add authors to m2m and create an order record
        article.authors.add(author_record)
        models.ArticleAuthorOrder.objects.create(
            article=article,
            author=author_record,
            order=article.next_author_sort())

        # Set the primary author
        article.owner = core_models.Account.objects.get(
            email=article_dict.get('correspondence_author'))
        article.correspondence_author = article.owner

        # Get or create the article's section
        try:
            section = models.Section.objects.language().fallbacks('en').get(
                journal=journal, name=article_dict.get('section'))
        except models.Section.DoesNotExist:
            section = None

        article.section = section

        article.save()

    # Attempt to get the default review form
    form = setting_handler.get_setting('general',
                                       'default_review_form',
                                       journal,
                                       create=True).processed_value

    if not form:
        try:
            form = review_models.ReviewForm.objects.filter(journal=journal)[0]
        except Exception:
            form = None
            logger.error(
                'You must have at least one review form for the journal before'
                ' importing.')
            exit()

    for review in article_dict.get('reviews'):
        try:
            reviewer = core_models.Account.objects.get(
                email=review.get('email'))
        except core_models.Account.DoesNotExist:
            reviewer = core_models.Account.objects.create(
                email=review.get('email'),
                first_name=review.get('first_name'),
                last_name=review.get('last_name'),
            )

        # Parse the dates
        date_requested = timezone.make_aware(
            dateparser.parse(review.get('date_requested')))
        date_due = timezone.make_aware(dateparser.parse(
            review.get('date_due')))
        date_complete = timezone.make_aware(
            dateparser.parse(review.get('date_complete'))) if review.get(
                'date_complete') else None
        date_confirmed = timezone.make_aware(
            dateparser.parse(review.get('date_confirmed'))) if review.get(
                'date_confirmed') else None

        # If the review was declined, set up a declined date stamp
        if review.get('declined') == '1':
            date_declined = date_confirmed
            date_accepted = None
            date_complete = date_confirmed
        else:
            date_accepted = date_confirmed
            date_declined = None

        new_review = review_models.ReviewAssignment.objects.create(
            article=article,
            reviewer=reviewer,
            review_round=round,
            review_type='traditional',
            visibility='double-blind',
            date_due=date_due,
            date_requested=date_requested,
            date_complete=date_complete,
            date_accepted=date_accepted,
            # also record the declined date computed above (assumes the review
            # assignment model exposes a date_declined field)
            date_declined=date_declined,
            access_code=uuid.uuid4(),
            form=form)

        if review.get('declined') or review.get('recommendation'):
            new_review.is_complete = True

        if review.get('recommendation'):
            new_review.decision = map_review_recommendation(
                review.get('recommendation'))

        if review.get('review_file_url'):
            filename, mime = shared.fetch_file(base_url,
                                               review.get('review_file_url'),
                                               None,
                                               None,
                                               article,
                                               None,
                                               handle_images=False,
                                               auth_file=auth_file)
            extension = os.path.splitext(filename)[1]

            review_file = shared.add_file(mime,
                                          extension,
                                          'Reviewer file',
                                          reviewer,
                                          filename,
                                          article,
                                          galley=False)
            new_review.review_file = review_file

        if review.get('comments'):
            filepath = core_files.create_temp_file(review.get('comments'),
                                                   'comment.txt')
            # use a context manager so the temporary comment file is closed
            with open(filepath, 'r') as file:
                comment_file = core_files.save_file_to_article(
                    file,
                    article,
                    article.owner,
                    label='Review Comments',
                    save=False)

            # note: this replaces any reviewer file attached above
            new_review.review_file = comment_file

        new_review.save()

    # Get MS File
    ms_file = get_ojs_file(base_url, article_dict.get('manuscript_file_url'),
                           article, auth_file, 'MS File')
    article.manuscript_files.add(ms_file)

    # Get RV File
    rv_file = get_ojs_file(base_url, article_dict.get('review_file_url'),
                           article, auth_file, 'RV File')
    round.review_files.add(rv_file)

    # Get Supp Files
    if article_dict.get('supp_files'):
        for file in article_dict.get('supp_files'):
            file = get_ojs_file(base_url, file.get('url'), article, auth_file,
                                file.get('title'))
            article.data_figure_files.add(file)

    article.save()
    round.save()

    return article
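
The function expects a plain dict built from OJS export data; a hedged sketch of a call with an illustrative, non-exhaustive article_dict (field names mirror the .get() calls above):

# All values are placeholders for illustration
article_dict = {
    'title': 'An Example Article',
    'abstract': 'Abstract text',
    'language': 'en',
    'date_submitted': '2019-01-01',
    'keywords': 'open access;publishing',
    'editors': ['editor@example.org'],
    'authors': [{'email': 'author@example.org', 'first_name': 'Ada',
                 'last_name': 'Lovelace', 'affiliation': 'Example University',
                 'bio': '', 'country': 'GB'}],
    'correspondence_author': 'author@example.org',
    'section': 'Articles',
    'reviews': [],
    'manuscript_file_url': 'https://journal.example.org/file/1',
    'review_file_url': 'https://journal.example.org/file/2',
    'supp_files': [],
}
article = create_article_with_review_content(
    article_dict, journal, '/tmp/ojs_auth.txt', 'https://journal.example.org')
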
Example #3
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to do a license lookup
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'), journal=journal)

    if len(license_object) > 0 and license_object[0] is not None:
        license_object = license_object[0]
        logger.info("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(name='All rights reserved',
                                                    journal=journal)
        logger.warning(
            "Did not find a license for this article. Using: {0}".format(
                license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a',
                                     text='Peer Reviewed') is not None
    logger.debug("Peer reviewed: {0}".format(peer_reviewed))

    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        logger.info("Ripping XML")
        xml = xml.get('href', None).strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add it as an HTML galley
        logger.info("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}

    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        logger.info("Attempting to assign thumbnail.")

        final_path_element = url.split('/')[-1]
        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(final_path_element)
        article_id = matches.group(1)

        logger.info("Determined remote article ID as: {0}".format(article_id))
        logger.info("Thumbnail path: {thumb_path}, URL: {url}".format(
            thumb_path=thumb_path, url=url))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime,
                            'graphic',
                            'Thumbnail',
                            user,
                            filename,
                            new_article,
                            thumbnail=True)
        except Exception as e:
            logger.warning("Unable to import thumbnail: %s" % e)

    # look up the article view/download stats
    stats = soup_object.findAll('div', {'class': 'stat-number'})

    # save the article to the database
    new_article.save()

    try:
        if stats:
            from metrics import models as metrics_models
            views = stats[0].contents[0]
            if len(stats) > 1:
                downloads = stats[1].contents[0]
            else:
                downloads = 0

            metrics_models.HistoricArticleAccess.objects.create(
                article=new_article, views=views, downloads=downloads)
    except (IndexError, AttributeError):
        logger.info("No article metrics found")
Example #4
File: up.py  Project: pixelandpen/janeway
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to do a license lookup
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'), journal=journal)

    if len(license_object) > 0 and license_object[0] is not None:
        license_object = license_object[0]
        print("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(name='All rights reserved',
                                                    journal=journal)
        print("Did not find a license for this article. Using: {0}".format(
            license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a',
                                     text='Peer Reviewed') is not None
    print("Peer reviewed: {0}".format(peer_reviewed))

    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        print("Ripping XML")
        xml = xml.get('href', None).strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add it as an HTML galley
        print("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}

    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        print("Attempting to assign thumbnail.")

        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(url)

        article_id = matches.group(1)

        print("Determined remote article ID as: {0}".format(article_id))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime,
                            'graphic',
                            'Thumbnail',
                            user,
                            filename,
                            new_article,
                            thumbnail=True)
        except Exception as e:
            print("Unable to import thumbnail. Recoverable error: {0}".format(e))

    # save the article to the database
    new_article.save()
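
This variant reports progress with print() and matches the article ID against the whole URL rather than its last path element; a hedged sketch of driving it over a list of URLs (the list itself is illustrative):

# Illustrative article landing pages on a Ubiquity Press style site
urls = [
    'https://www.example-journal.org/articles/10.5334/example.1/',
    'https://www.example-journal.org/articles/10.5334/example.2/',
]
for url in urls:
    import_article(journal, user, url, thumb_path='/articles/thumbs')
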
Example #5
def import_issue_images(journal, user, url, import_missing=False):
    """ Imports all issue images and other issue related content
    Currently also reorders all issues, articles and sections within issues,
    article thumbnails and issue titles.
    :param journal: a journal.models.Journal
    :param user: the owner of the imported content as a core.models.Account
    :param url: the base url of the journal to import from
    :param load_missing: Bool. If true, attempt to import missing articles
    """
    base_url = url

    if not url.endswith('/issue/archive/'):
        url += '/issue/archive/'

    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    resp, mime = utils_models.ImportCacheEntry.fetch(url=url)

    soup = BeautifulSoup(resp, 'lxml')

    from django.conf import settings
    import os
    from django.core.files import File

    for issue in journal.issues():
        issue_num = issue.issue
        pattern = re.compile(r'\/\d+\/volume\/{0}\/issue\/{1}'.format(
            issue.volume, issue_num))

        img_url_suffix = soup.find(src=pattern)

        if img_url_suffix:
            img_url = base_url + img_url_suffix.get('src')
            logger.info("Fetching {0}".format(img_url))

            resp, mime = utils_models.ImportCacheEntry.fetch(url=img_url)

            path = os.path.join(settings.BASE_DIR, 'files', 'journals',
                                str(journal.id))

            os.makedirs(path, exist_ok=True)

            path = os.path.join(
                path,
                'volume{0}_issue_{1}.graphic'.format(issue.volume, issue_num))

            with open(path, 'wb') as f:
                f.write(resp)

            with open(path, 'rb') as f:
                issue.cover_image.save(path, File(f))

            sequence_pattern = re.compile(
                r'.*?(\d+)\/volume\/{0}\/issue\/{1}.*'.format(
                    issue.volume, issue_num))

            issue.order = int(sequence_pattern.match(img_url).group(1))

            logger.info(
                "Setting Volume {0}, Issue {1} sequence to: {2}".format(
                    issue.volume, issue_num, issue.order))

            logger.info("Extracting section orders within the issue...")

            new_url = '/{0}/volume/{1}/issue/{2}/'.format(
                issue.order, issue.volume, issue_num)
            resp, mime = utils_models.ImportCacheEntry.fetch(url=base_url +
                                                             new_url)

            soup_issue = BeautifulSoup(resp, 'lxml')

            # Find issue title
            try:
                issue_title = soup_issue.find("div", {
                    "class": "multi-inline"
                }).find("h1").string
                issue_title = issue_title.strip(" -\n")
                if issue.issue_title and issue_title not in issue.issue_title:
                    issue.issue_title = "{} - {}".format(
                        issue_title, issue.issue_title)
                else:
                    issue.issue_title = issue_title
            except AttributeError as e:
                logger.debug("Couldn't find an issue title: %s" % e)

            # Find issue description
            try:
                desc_parts = soup_issue.find("div", {
                    "class": "article-type-list-block"
                }).findAll("p", {"class": "p1"})
                issue.issue_description = "\n".join(str(p) for p in desc_parts)
            except AttributeError as e:
                logger.debug("Couldn't extract an issue description %s" % e)

            sections_to_order = soup_issue.find_all(
                name='h2', attrs={'class': 'main-color-text'})

            # delete existing order models for sections for this issue
            journal_models.SectionOrdering.objects.filter(issue=issue).delete()

            for section_order, section in enumerate(sections_to_order):

                logger.info('[{0}] {1}'.format(section_order,
                                               section.getText()))
                order_section, c = models.Section.objects.language(
                    'en').get_or_create(name=section.getText().strip(),
                                        journal=journal)
                journal_models.SectionOrdering.objects.create(
                    issue=issue, section=order_section,
                    order=section_order)

            logger.info("Extracting article orders within the issue...")

            # delete existing order models for issue
            journal_models.ArticleOrdering.objects.filter(issue=issue).delete()

            pattern = re.compile(r'\/articles\/(.+?)/(.+?)/')
            articles = soup_issue.find_all(href=pattern)

            article_order = 0

            processed = []

            for article_link in articles:
                # parse the URL into a DOI and prefix
                article_url = article_link["href"]
                match = pattern.match(article_url)
                prefix = match.group(1)
                doi = match.group(2)

                # get a proper article object
                article = models.Article.get_article(
                    journal, 'doi', '{0}/{1}'.format(prefix, doi))
                if not article and import_missing:
                    logger.debug("Article %s not found, importing...",
                                 article_url)
                    import_article(journal, user, base_url + article_url)

                if article and article not in processed:
                    thumb_img = article_link.find("img")
                    if thumb_img:
                        thumb_path = thumb_img["src"]
                        filename, mime = shared.fetch_file(
                            base_url,
                            thumb_path,
                            "",
                            'graphic',
                            article,
                            user,
                        )
                        shared.add_file(
                            mime,
                            'graphic',
                            'Thumbnail',
                            user,
                            filename,
                            article,
                            thumbnail=True,
                        )
                    journal_models.ArticleOrdering.objects.get_or_create(
                        issue=issue,
                        article=article,
                        section=article.section,
                        order=article_order,
                    )

                    article_order += 1

                processed.append(article)
            issue.save()
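
A hedged usage sketch; the base URL is a placeholder, and passing import_missing=True makes the function call import_article for any article it cannot find locally:

# journal and user as in the earlier examples
import_issue_images(journal, user,
                    'https://www.example-journal.org',
                    import_missing=True)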