def handle(self, *args, **options):
    '''Run every selected article through process_article and
    process_relations, then print a summary of the counters.

    Positional ``args`` are treated as pids.  When none are given, every
    object with the article content model is selected instead.

    Raises CommandError when the pid list cannot be built or paginated.
    '''
    import traceback

    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # counters
    self.counts = defaultdict(int)

    # connection to repository
    repo = ManagementRepository()

    # Symplectic-Elements setup
    self.session = requests.Session()
    self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
    # NOTE(review): certificate verification is switched off for every
    # request made through this session -- confirm this is still required.
    self.session.verify = False
    self.session.stream = True
    self.session.headers.update({'Content-Type': 'text/xml'})

    self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
    self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
    self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

    # if pids specified, use that list
    try:
        if args:
            pid_set = [repo.get_object(pid=p, type=Publication) for p in args]
        else:
            # search for Articles.
            pid_set = repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        articles = Paginator(pid_set, 20)
        self.counts['total'] = articles.count
    except Exception as e:
        # Without a paginator there is nothing to iterate over; stop here
        # instead of failing later on an undefined name.
        raise CommandError('Error paginating items: %s' % e.message)

    # process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue

        for article in objs:
            try:
                # Process article and relations
                symp_pub, relations = article.as_symp()
                self.process_article(article.pid, symp_pub, options)
                self.process_relations(article.pid, relations, options)
                # pause between articles
                sleep(1)
            except Exception as e:
                self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                traceback.print_exc()
                self.counts['errors'] += 1

    # summarize what was done
    # Nothing in this method increments 'skipped' or 'warnings'; because
    # self.counts is a defaultdict(int) they print as 0 unless another
    # method has updated them.
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
    self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
    self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
class Command(BaseCommand):
    '''
    This command runs through all the articles and, for every article whose
    journal title appears in JOURNAL_LIST, replaces that title with the first
    matching journal title found in Sherpa Romeo. Publisher names are not
    changed.
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): handle() never reads options['noact'], so passing
    # --noact does not prevent articles from being saved.
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
    )

    def handle(self, *args, **options):
        '''Walk all article objects page by page and normalise journal titles.

        An article is only touched when its journal title, compared case
        insensitively, is listed in JOURNAL_LIST and Sherpa Romeo returns at
        least one journal whose title starts with it.  Failures on a single
        page or article are reported and the run carries on.
        '''
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            # nothing to iterate over without a paginator
            return

        # Lower-cased once up front; calling .lower() on each entry works for
        # both byte strings and unicode strings.
        known_titles = set(entry.lower() for entry in JOURNAL_LIST)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue

            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue

                    mods = article.descMetadata.content
                    if mods.journal is None:
                        continue

                    title = mods.journal.title
                    if not title or title.lower() not in known_titles:
                        continue

                    # Only query Sherpa Romeo once we know the title is one
                    # we intend to replace.
                    journals = romeo.search_journal_title(title, type='starts')
                    suggestions = [journal_suggestion_data(journal) for journal in journals]
                    if not suggestions:
                        self.output(0, "No Sherpa Romeo journal found for %s title '%s'" % (article.pid, title))
                        continue

                    mods.journal.title = suggestions[0]['value']
                    self.output(0, "JOURNAL")
                    self.output(0, mods.journal.title)
                    article.save()
                except Exception as e:
                    # lookup and save failures land here too, so they are
                    # reported instead of being silently dropped
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
class Command(BaseCommand):
    '''
    This command runs through all the articles and writes one row of
    metadata per article to publications_csv.csv in the current directory
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): handle() never reads options['noact'].
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
    )

    # file the report is written to
    CSV_FILENAME = "publications_csv.csv"

    # column headings, in the same order as the values built by _csv_row
    CSV_HEADER = (
        u"PID",
        u"Title",
        u"Withdrawn",
        u"Authors",
        u"Journal Title",
        u"Publisher",
        u"Version",
        u"Final Published Link",
        u"DOI",
        u"Subjects",
        u"Funding Group",
        u"CC License",
        u"Copyright Statement",
        u"Admin Note",
        u"Date Reviewed",
        u"Rights Research Date",
        u"PMC",
        u"PUBSID",
        u"File Deposited",
    )

    def handle(self, *args, **options):
        '''Write the header and one CSV row per existing article.

        Articles that do not exist are skipped with a message.  An error
        while building or writing a single row is reported and the export
        continues with the next article.
        '''
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # Binary mode is what the csv module expects on Python 2.  The with
        # block closes the file on every exit path; csv writer objects have
        # no close() of their own.
        with open(self.CSV_FILENAME, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([smart_str(heading) for heading in self.CSV_HEADER])

            try:
                articles = Paginator(pid_set, 100)
            except Exception as e:
                self.output(0, "Error paginating items: : %s " % (e.message))
                # nothing to iterate over without a paginator
                return

            # process all Articles
            for p in articles.page_range:
                try:
                    objs = articles.page(p).object_list
                except Exception as e:
                    # print error and go to next iteration of loop
                    self.output(0, "Error getting page: %s : %s " % (p, e.message))
                    continue

                for article in objs:
                    try:
                        if not article.exists:
                            self.output(0, "Skipping %s because pid does not exist" % article.pid)
                            continue
                        writer.writerow(self._csv_row(article))
                    except Exception as e:
                        self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))

    @staticmethod
    def _text(value):
        '''Encode value for the CSV, writing an empty cell for None/empty.'''
        return smart_str(value if value else '')

    def _csv_row(self, article):
        '''Return the list of encoded cell values for one article.'''
        text = self._text
        mods = article.descMetadata.content
        symp = article.sympAtom.content

        authors = ['%s %s' % (author.given_name, author.family_name) for author in mods.authors]
        subjects = [subject.topic for subject in mods.subjects]
        funders = [funder.name for funder in mods.funders]

        return [
            text(article.pid),
            text(article.label),
            # boolean is written as-is so that False shows up as "False"
            smart_str(article.is_withdrawn),
            smart_str(",".join(authors)),
            text(mods.journal.title if mods.journal else ''),
            text(mods.journal.publisher if mods.journal else ''),
            text(mods.version),
            text(mods.final_version.url if mods.final_version else ''),
            text(mods.final_version.doi if mods.final_version else ''),
            smart_str(",".join(subjects)),
            smart_str(",".join(funders)),
            text(mods.license.text if mods.license else ''),
            text(mods.copyright.text if mods.copyright else ''),
            text(mods.admin_note.text if mods.admin_note else ''),
            text(article.provenance.content.date_reviewed if article.provenance else ''),
            text(mods.rights_research_date),
            text(article.pmcid),
            text(symp.pubs_id if symp else ''),
            smart_str("Yes" if article.pdf.exists else 'No'),
        ]

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
class Command(BaseCommand):
    '''
    This command runs through all the articles or all the books and adds the
    Publication.PUBLICATION_CONTENT_MODEL content model to every object that
    does not have it yet
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): handle() never reads options['force'].
    option_list = BaseCommand.option_list + (
        make_option('--article', '-a',
                    action='store_true',
                    default=False,
                    help='Cleans up content models for articles.'),
        # a plain flag, like --article, so that it does not demand a value
        make_option('--book', '-b',
                    action='store_true',
                    default=False,
                    help='Cleans up content models for books.'),
        make_option('--force', '-f',
                    action='store_true',
                    default=False,
                    help='Updates even if SYMPLECTIC-ATOM has not been modified since last run.'),
    )

    def handle(self, *args, **options):
        '''Add the publication content model to articles or books lacking it.

        Exactly one of --article / --book must be given.  Objects that do
        not exist are skipped with a message; an error on a single page or
        object is reported and the run continues.

        Raises CommandError when both or neither of the two options are
        given, or when the selected objects cannot be paginated.
        '''
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        if options['article'] and options['book']:
            raise CommandError('Can not use both parameters')
        if not options['article'] and not options['book']:
            raise CommandError('Use at least one parameter')

        # exactly one of the two options is set at this point
        if options['article']:
            cmodel = Publication.ARTICLE_CONTENT_MODEL
        else:
            cmodel = Publication.BOOK_CONTENT_MODEL

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

        try:
            publications = Paginator(pid_set, 100)
        except Exception as e:
            # nothing to iterate over without a paginator
            raise CommandError("Error paginating items: %s" % e.message)

        # process all publications
        for p in publications.page_range:
            try:
                objs = publications.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue

            for publication in objs:
                try:
                    if not publication.exists:
                        self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                        continue

                    if publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                        continue

                    publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                    publication.save()
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (publication.pid, e.message))