def handle(self, *args, **options):
    """Reads a CSV of DOIs and article URLs and writes an iParadigms
    crawl list mapping each DOI to its PDF URL.

    :param args: None
    :param options: dictionary containing 'input_file', a CSV with DOI
        and URL columns, and 'output_file', the path of the crawl list
        to write
    :return: None
    """
    with open(options['input_file'], 'r') as in_file:
        reader = csv.DictReader(in_file)

        with open(options['output_file'], 'w') as out_file:
            out = csv.writer(out_file)
            out.writerow(['DOI', '<item crawler="iParadigms">'])

            for row in reader:
                if 'amazonaws' not in row['URL'] and 'uwp.co.uk' not in row['URL']:
                    page = shared.fetch_page(row['URL'])
                    pdf_url = shared.get_pdf_url(page)
                    out.writerow([row['DOI'], pdf_url])
                    out_file.flush()
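
# A minimal sketch of the add_arguments() definition that handle() above
# relies on. It is not part of the excerpt: the argument names are only
# assumptions inferred from options['input_file'] and options['output_file'].
def add_arguments(self, parser):
    """Register the CSV paths consumed by handle()."""
    parser.add_argument('input_file',
                        help='path to a CSV with DOI and URL columns')
    parser.add_argument('output_file',
                        help='path to write the iParadigms crawl list to')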
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to do a license lookup
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'),
        journal=journal)

    if len(license_object) > 0 and license_object[0] is not None:
        license_object = license_object[0]
        logger.info("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(
            name='All rights reserved', journal=journal)
        logger.warning(
            "Did not find a license for this article. Using: {0}".format(
                license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a', text='Peer Reviewed') is not None
    logger.debug("Peer reviewed: {0}".format(peer_reviewed))
    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        logger.info("Ripping XML")
        xml = xml.get('href', None).strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add as an HTML galley
        logger.info("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}
    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        logger.info("Attempting to assign thumbnail.")

        final_path_element = url.split('/')[-1]
        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(final_path_element)
        article_id = matches.group(1)

        logger.info("Determined remote article ID as: {0}".format(article_id))
        logger.info("Thumbnail path: {thumb_path}, URL: {url}".format(
            thumb_path=thumb_path, url=url))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime, 'graphic', 'Thumbnail', user, filename,
                            new_article, thumbnail=True)
        except Exception as e:
            logger.warning("Unable to import thumbnail: %s" % e)

    # lookup stats
    stats = soup_object.findAll('div', {'class': 'stat-number'})

    # save the article to the database
    new_article.save()

    try:
        if stats:
            from metrics import models as metrics_models
            views = stats[0].contents[0]
            if len(stats) > 1:
                downloads = stats[1].contents[0]
            else:
                downloads = 0

            metrics_models.HistoricArticleAccess.objects.create(
                article=new_article,
                views=views,
                downloads=downloads)
    except (IndexError, AttributeError):
        logger.info("No article metrics found")
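
# Illustrative usage sketch for import_article() above, assuming it runs
# inside a Janeway project where journal.models.Journal and core.models.Account
# are importable. The journal code, URL, and thumbnail path below are
# placeholders, not values from the original module, and this helper is never
# called by the importer itself.
def _example_up_import():
    from journal import models as journal_models
    from core import models as core_models

    # any journal and any account that should own the imported files
    journal = journal_models.Journal.objects.get(code='example')
    user = core_models.Account.objects.first()

    import_article(
        journal,
        user,
        'https://www.example-press.org/articles/abstract/10.5334/example.123/',
        thumb_path='/articles/thumbs',
    )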
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to do a license lookup
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'),
        journal=journal)

    if len(license_object) > 0 and license_object[0] is not None:
        license_object = license_object[0]
        print("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(
            name='All rights reserved', journal=journal)
        print("Did not find a license for this article. Using: {0}".format(
            license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a', text='Peer Reviewed') is not None
    print("Peer reviewed: {0}".format(peer_reviewed))
    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        print("Ripping XML")
        xml = xml.get('href', None).strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article" and add as an HTML galley
        print("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})

        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}
    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        print("Attempting to assign thumbnail.")

        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(url)
        article_id = matches.group(1)

        print("Determined remote article ID as: {0}".format(article_id))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime, 'graphic', 'Thumbnail', user, filename,
                            new_article, thumbnail=True)
        except Exception:
            print("Unable to import thumbnail. Recoverable error.")

    # save the article to the database
    new_article.save()
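
# The two import_article() variants above differ in how they derive the remote
# article ID for thumbnail fetching: the first matches the digit regex against
# the final path element of the URL, this one against the whole URL. The
# sketch below only illustrates that difference; the URL is a placeholder, not
# taken from the original code, and this helper is never called.
def _article_id_extraction_example():
    import re

    url = 'https://www.example-press.org/articles/abstract/10.5334/example.123'
    id_regex = re.compile(r'.*?(\d+)')

    # Matching the whole URL captures the first digit run it encounters,
    # here the '10' from the DOI prefix.
    from_whole_url = id_regex.match(url).group(1)  # '10'

    # Matching only the last path element captures the trailing article
    # number instead.
    from_last_element = id_regex.match(url.split('/')[-1]).group(1)  # '123'

    return from_whole_url, from_last_element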