def handle(self, *args, **options):
        '''Run ``process_article`` and ``process_relations`` for a set of articles.

        Any positional ``args`` are used as the pids to process; with no
        arguments every object with the article content model is selected.
        A failure on a single page or a single article is reported, counted
        and skipped so the rest of the run continues.  A summary of
        ``self.counts`` is written to stdout at the end.

        :raises CommandError: if the pid list cannot be loaded or paginated
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        self.counts = defaultdict(int)

        # connection to repository
        repo = ManagementRepository()

        # Symplectic-Elements setup
        self.session = requests.Session()
        self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
        # NOTE(review): certificate verification is switched off for every
        # request made through this session -- confirm this is still required.
        self.session.verify = False
        self.session.stream = True
        self.session.headers.update({'Content-Type': 'text/xml'})

        self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
        self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
        self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

        try:
            if args:
                # pids were given on the command line: use exactly that list
                pid_set = [repo.get_object(pid=p, type=Publication) for p in args]
            else:
                # otherwise search for all Articles
                pid_set = repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, Article)
        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e)

        try:
            articles = Paginator(pid_set, 20)
            self.counts['total'] = articles.count
        except Exception as e:
            # Without a paginator there is nothing to iterate over; stop here
            # instead of failing later on an unbound ``articles``.
            raise CommandError('Error paginating items (%s)' % e)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                self.counts['errors'] += 1
                continue

            for article in objs:
                try:
                    # NOTE: no existence, published-state or duplicate checks
                    # are made here; every selected article is processed.
                    symp_pub, relations = article.as_symp()
                    self.process_article(article.pid, symp_pub, options)
                    self.process_relations(article.pid, relations, options)
                    # pause between articles (presumably to throttle requests
                    # to the Symplectic API -- confirm before changing)
                    sleep(1)
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e))
                    import traceback
                    traceback.print_exc()
                    self.counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
        self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
        self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
# Example #2
# 0
class Command(BaseCommand):
    '''Run through all the articles and, for each journal title that appears
    in JOURNAL_LIST, replace it with the first title suggested by Sherpa Romeo.
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Report the journal titles that would be changed without saving the articles'),
        )

    def handle(self, *args, **options):
        '''Page through every article and update its journal title if needed.

        Errors on a single page or article are reported through ``output``
        and the run continues with the next one.
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1
        noact = options.get('noact', False)

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            # nothing can be processed without a paginator
            self.output(0, "Error paginating items: : %s " % e)
            return

        # lower-case the list once instead of once per article
        known_titles = set(title.lower() for title in JOURNAL_LIST)

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                continue

            for article in objs:
                try:
                    self._update_journal_title(article, known_titles, noact)
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e))

    def _update_journal_title(self, article, known_titles, noact):
        '''Replace the journal title of one article with the first Romeo suggestion.

        Nothing happens when the article does not exist, has no journal or
        journal title, or its lower-cased title is not in ``known_titles``.
        When ``noact`` is true the new title is reported but not saved.
        '''
        if not article.exists:
            self.output(0, "Skipping %s because pid does not exist" % article.pid)
            return

        mods = article.descMetadata.content
        if mods.journal is None or mods.journal.title is None:
            return

        # check the local list first so the remote lookup only runs for
        # titles that will actually be replaced
        if mods.journal.title.lower() not in known_titles:
            return

        journals = romeo.search_journal_title(mods.journal.title, type='starts')
        suggestions = [journal_suggestion_data(journal) for journal in journals]
        if not suggestions:
            self.output(1, "Skipping %s because no suggestion was found for journal title '%s'"
                        % (article.pid, mods.journal.title))
            return

        mods.journal.title = suggestions[0]['value']
        self.stdout.write("JOURNAL\n")
        self.stdout.write("%s\n" % mods.journal.title)
        if not noact:
            article.save()

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
# Example #3
# 0
class Command(BaseCommand):
    '''Export metadata for every article in the repository to the file
    ``publications_csv.csv``, one row per article.
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): --noact is accepted but never read by handle(); its help
    # text does not describe this command either -- confirm the intended use.
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
        )

    def handle(self, *args, **options):
        '''Write a header row and then one CSV row per existing article.

        Errors on a single page or article are reported through ``output``
        and the export continues with the next one.
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # The ``with`` block closes the file on every exit path; csv.writer
        # objects themselves have no close() method.
        with open("publications_csv.csv", 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([
                smart_str(u"PID"),
                smart_str(u"Title"),
                smart_str(u"Withdrawn"),
                smart_str(u"Authors"),
                smart_str(u"Journal Title"),
                smart_str(u"Publisher"),
                smart_str(u"Version"),
                smart_str(u"Final Published Link"),
                smart_str(u"DOI"),
                smart_str(u"Subjects"),
                smart_str(u"Funding Group"),
                smart_str(u"CC License"),
                smart_str(u"Copyright Statement"),
                smart_str(u"Admin Note"),
                smart_str(u"Date Reviewed"),
                smart_str(u"Rights Research Date"),
                smart_str(u"PMC"),
                smart_str(u"PUBSID"),
                smart_str(u"File Deposited"),
            ])

            try:
                articles = Paginator(pid_set, 100)
            except Exception as e:
                # nothing can be exported without a paginator
                self.output(0, "Error paginating items: : %s " % e)
                return

            # process all Articles
            for p in articles.page_range:
                try:
                    objs = articles.page(p).object_list
                except Exception as e:
                    # print error and go to next iteration of loop
                    self.output(0, "Error getting page: %s : %s " % (p, e))
                    continue

                for article in objs:
                    try:
                        if not article.exists:
                            self.output(0, "Skipping %s because pid does not exist" % article.pid)
                            continue
                        writer.writerow(self._csv_row(article))
                    except Exception as e:
                        self.output(0, "Error processing pid: %s : %s " % (article.pid, e))

    def _csv_row(self, article):
        '''Return the list of cell values for one article, in header order.'''
        mods = article.descMetadata.content
        symp = article.sympAtom.content
        authors = ['%s %s' % (author.given_name, author.family_name) for author in mods.authors]
        subjects = [subject.topic for subject in mods.subjects]
        funders = [funder.name for funder in mods.funders]

        return [
            smart_str(article.pid if article.pid else ''),
            smart_str(article.label if article.label else ''),
            smart_str(article.is_withdrawn),
            smart_str(",".join(authors)),
            smart_str(mods.journal.title if mods.journal else ''),
            smart_str(mods.journal.publisher if mods.journal else ''),
            smart_str(mods.version if mods.version else ''),
            smart_str(mods.final_version.url if mods.final_version else ''),
            smart_str(mods.final_version.doi if mods.final_version else ''),
            smart_str(",".join(subjects)),
            smart_str(",".join(funders)),
            smart_str(mods.license.text if mods.license else ''),
            smart_str(mods.copyright.text if mods.copyright else ''),
            smart_str(mods.admin_note.text if mods.admin_note else ''),
            smart_str(article.provenance.content.date_reviewed if article.provenance else ''),
            smart_str(mods.rights_research_date if mods.rights_research_date else ''),
            smart_str(article.pmcid if article.pmcid else ''),
            smart_str(symp.pubs_id if symp else ''),
            smart_str("Yes" if article.pdf.exists else 'No'),
        ]

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
# Example #4
# 0
class Command(BaseCommand):
    '''Run through all the articles (--article) or all the books (--book) and
    add the publication content model to every object that does not have it.
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): --force is accepted but never read by handle().
    option_list = BaseCommand.option_list + (
        make_option('--article', '-a',
                    action='store_true',
                    default=False,
                    help='Cleans up content models for articles.'),
        make_option('--book', '-b',
                    # a plain flag like --article; 'store' would demand a value
                    action='store_true',
                    default=False,
                    help='Cleans up content models for books.'),
        make_option('--force', '-f',
                    action='store_true',
                    default=False,
                    help='Updates even if SYMPLECTIC-ATOM has not been modified since last run.'),
        )

    def handle(self, *args, **options):
        '''Add the publication content model to the selected objects.

        Exactly one of ``--article`` and ``--book`` must be given.  Errors on
        a single page or object are reported through ``output`` and the run
        continues with the next one.

        :raises CommandError: if both or neither of the two flags are given,
            or if the objects cannot be paginated
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        if options['article'] and options['book']:
            raise CommandError('Can not use both parameters')

        if not options['article'] and not options['book']:
            raise CommandError('Use at least one parameter')

        # exactly one of the two flags is set at this point
        if options['article']:
            cmodel = Publication.ARTICLE_CONTENT_MODEL
        else:
            cmodel = Publication.BOOK_CONTENT_MODEL

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

        try:
            publications = Paginator(pid_set, 100)
        except Exception as e:
            # nothing can be processed without a paginator
            raise CommandError('Error paginating items (%s)' % e)

        # process all publications
        for p in publications.page_range:
            try:
                objs = publications.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e))
                continue

            for publication in objs:
                try:
                    if not publication.exists:
                        self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                        continue

                    # only touch objects that are missing the content model
                    if not publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                        publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                        publication.save()
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (publication.pid, e))