Exemplo n.º 1
0
    def handle(self, *args, **options):
        """Add the generic Publication content model to article or book objects.

        Exactly one of the ``article`` / ``book`` options must be set; it
        selects which content model is used to fetch objects from the
        repository.  Each fetched object that lacks
        ``Publication.PUBLICATION_CONTENT_MODEL`` gets that relationship
        added and is saved.

        Raises:
            CommandError: if both or neither of the options are given.
        """
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # --article and --book are mutually exclusive, and one is required
        if options['article'] and options['book']:
            raise CommandError('Can not use both parameters')

        if not options['article'] and not options['book']:
            raise CommandError('Use at least one parameter')

        if options['article']:
            cmodel = Publication.ARTICLE_CONTENT_MODEL

        if options['book']:
            cmodel = Publication.BOOK_CONTENT_MODEL

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

        try:
            publications = Paginator(pid_set, 100)

        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        # process all publications page by page
        for p in publications.page_range:
            try:
                objs = publications.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue
            for publication in objs:
                try:
                    if not publication.exists:
                        self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                        continue
                    # BUG FIX: the original had a second dangling ``else:``
                    # here (a syntax error); only the has_model check is needed
                    if not publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                        publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                        publication.save()

                except Exception as e:
                    # BUG FIX: original referenced undefined name ``article``;
                    # the loop variable is ``publication``
                    self.output(0, "Error processing pid: %s : %s " % (publication.pid, e.message))
    def handle(self, *args, **options):
        """Replace (or ignore) duplicate Symplectic publication objects.

        Pids are given as positional args (required), and exactly one of the
        ``replace`` / ``ignore`` options must be set.  For each pid whose
        RELS-EXT carries a ``dcterms:replaces`` relation (marking it as a
        duplicate), the original object may be overwritten with the
        duplicate's SYMPLECTIC-ATOM content and most recent allowed-mimetype
        datastream, and the related pubs object is re-pointed at the original
        pid via hasCurrent/hasVisible.  Nothing is saved when the ``noact``
        option is set.  A report is written if anything was saved.
        """
        self.options = options
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        self.counts = defaultdict(int)

        # duplicates list (duplicate pid -> original pid, for the report)
        self.duplicates = {}

        # set the name of the report of duplications
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "replaces-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        # get last run time and set new one
        # NOTE(review): LastRun.start_time is presumably stored as naive
        # US/Eastern local time; it is localized then converted to UTC below
        time_zone = pytz.timezone('US/Eastern')

        last_run = LastRun.objects.get(name='Convert Symp to OE')
        date = last_run.start_time

        self.output(1, '%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
        date = time_zone.localize(date)
        date = date.astimezone(pytz.utc)
        date_str = date.strftime("%Y-%m-%dT%H:%M:%S")
        self.output(1, '%s UTC' % date_str)

        try:
            # Raise error if replace or ignore is not specified
            # (``is`` on the two booleans: True when both or neither are set)
            if self.options['replace'] is self.options['ignore']:
                raise Exception("no actions set. Specify --replace or --ignore")

            #if pids specified, use that list
            if len(args) != 0:
                pids = list(args)
            else:
                raise Exception("no pids specified")
        except Exception as e:
            raise Exception("Error getting pids: %s" % e.message)

        self.counts['total'] = len(pids)

        for pid in pids:
            try:
                self.output(1, "\nProcessing %s" % pid)
                # Load first as Article becauce that is the most likely type
                obj = self.repo.get_object(pid=pid)
                if not obj.exists:
                    self.output(1, "Skipping because %s does not exist" % pid)
                    continue
                ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')

                if not ds:
                    self.output(1, "Skipping %s because SYMPLECTIC-ATOM ds does not exist" % pid)
                    continue
                # ds_mod is computed but not used below -- kept for parity
                ds_mod = ds.last_modified().strftime("%Y-%m-%dT%H:%M:%S")
                #
                # for property, value in vars(ds).iteritems():
                #     msg = "%s: %s" %(property, value)
                #     self.output(1, msg)


                # WHEN OVERWRITING ORINGIALS WITH A DUPLICATE
                # 1. Make sure object content model has from_symp() function
                # 2. Add to  content_types dict
                # 3. Add elif block (see few lines below)
                # 4. Add line in summary section

                # choose content type
                content_types = {'Article': 'journal article', 'Book': 'book', 'Chapter': 'chapter', 'Conference': 'conference', 'Poster': 'poster', 'Report': 'report', 'Presentation': 'presentation'}
                # NOTE(review): assumes the atom feed's second category label
                # is the human-readable content type -- TODO confirm
                obj_types = ds.content.node.xpath('atom:category/@label', namespaces={'atom': 'http://www.w3.org/2005/Atom'})
                if obj_types[1] in content_types.values():
                    logging.info("Processing %s as Publication" % pid)
                    obj = self.repo.get_object(pid=pid, type=Publication)
                else:
                    logging.info("Skipping %s Invalid Content Type" % pid)
                    continue


                obj.from_symp()

                # get a list of predicates
                properties = []
                for p in list(obj.rels_ext.content.predicates()):
                  properties.append(str(p))

                # process only if the rels-ext has the "replaces" tag, which indicates duplicates
                replaces_tag = "http://purl.org/dc/terms/replaces"
                if replaces_tag in properties:

                    # Get the pubs object
                    # pubs_id = obj.sympAtom.content.serialize().split('<pubs:id>')[1].split('</pubs:id>')[0]
                    pubs_id = obj.sympAtom.content.pubs_id
                    pubs_id = "pubs:%s" % (pubs_id)
                    self.output(1, "Pub ID: %s" % pubs_id)
                    pubs_obj = self.repo.get_object(pid=pubs_id)

                    self.counts['Publication']+=1

                    # extract the pid the duplicate replaces from the raw
                    # serialized RELS-EXT
                    original_pid = obj.rels_ext.content.serialize().split('<dcterms:replaces rdf:resource="')[1].split('"')[0]
                    original_obj = self.repo.get_object(pid=original_pid, type=Publication)
                    original_obj.from_symp()

                    if not original_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % original_obj)
                        self.counts['skipped']+=1
                        continue

                    # sanity check: the original must point back at this pid
                    if not pid in original_obj.rels_ext.content.serialize():
                        self.output(1, "Skipping because %s does not contain %s" % (original_obj, pid) )
                        self.counts['skipped']+=1
                        continue

                    self.output(1, "Original pid: %s\n Duplicate pid: %s" % (original_pid, pid))

                    # REPLACE ORIGINAL WITH DUPLICATE
                    if self.options['replace']:
                        original_obj.sympAtom.content = obj.sympAtom.content

                        # replace PDF
                        mime = None
                        mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_types.values()]

                        if mime_ds_list:
                            # sort by DS timestamp does not work yet asks for global name obj because of lambda function
                            new_dict = {}
                            for mime in mime_ds_list:
                                new_dict[mime] = obj.getDatastreamObject(mime).last_modified()

                            sorted_mimes = sorted(new_dict.items(), key=lambda x: x[1])

                            # sorted_mimes = sorted(mime_ds_list, key=lambda p: str(obj.getDatastreamObject(p).last_modified()))
                            mime = sorted_mimes[-1][0]  # most recent
                            original_obj.pdf.content = obj.getDatastreamObject(mime).content

                    # IGNORE DUPLICATE
                    elif self.options['ignore']:
                        self.reportname = "ignore-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

                    # Add to duplicate dict for report
                    self.duplicates[pid.replace('info:fedora/','')] = original_pid.replace('info:fedora/','')

                    # Update pubs object to point hasCurrent and hasVisible attibutes to the original_pid
                    sympns = Namespace('info:symplectic/symplectic-elements:def/model#')
                    pubs_obj.rels_ext.content.bind('symp', sympns)
                    has_current = (URIRef("info:fedora/"+pubs_obj.pid),\
                                    URIRef('info:symplectic/symplectic-elements:def/model#hasCurrent'), \
                                    URIRef(original_pid))
                    has_visible = (URIRef("info:fedora/"+pubs_id),\
                                    URIRef('info:symplectic/symplectic-elements:def/model#hasVisible'), \
                                    URIRef(original_pid))
                    # hasCurrent: remove any existing triple first, then set
                    pubs_obj.rels_ext.content.remove(has_current)
                    pubs_obj.rels_ext.content.set(has_current)

                    # hasVisible
                    pubs_obj.rels_ext.content.remove(has_visible)
                    pubs_obj.rels_ext.content.set(has_visible)

                    # Close pubs rels_ext object
                    pubs_obj.rels_ext.content.close()

                    # SAVE OBJECTS UNLESS NOACT OPTION
                    if not options['noact']:
                        original_obj.save()
                        pubs_obj.save()
                        self.counts['saved']+=1

                # if not a duplicate
                else:
                    self.output(1, "Skipping because %s is not a duplicate" % pid)
                    self.counts['skipped']+=1
                    continue


            except (KeyboardInterrupt, SystemExit):
                # write a partial report before bailing out on interrupt
                if self.counts['saved'] > 0:
                  self.write_report(self.duplicates, error="interrupt")
                raise

            except Exception as e:
                self.output(1, "Error processing %s: %s" % (pid, e.message))
                self.output(1, obj.rels_ext.content.serialize(pretty=True))
                self.counts['errors']+=1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Converted: %s\n" % self.counts['saved'])

        if self.counts['saved'] > 0:
          self.write_report(self.duplicates)
    def handle(self, *args, **options):
        """Push OpenEmory articles into Symplectic-Elements over its HTTP API.

        With positional pids, only those objects are processed; otherwise all
        objects with the article content model are fetched.  Each article is
        serialized via ``as_symp()`` and sent through ``process_article`` /
        ``process_relations``, sleeping 1s between objects to throttle the
        remote API.  A large block of duplicate-detection logic (by PMC id
        and by title match) is currently commented out.
        """
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        self.counts = defaultdict(int)

        # connection to repository
        repo = ManagementRepository()

        # Symplectic-Elements setup: authenticated, streaming XML session
        self.session = requests.Session()
        self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
        # NOTE(review): verify=False disables TLS certificate checks -- confirm
        # this is intentional for the target environment
        self.session.verify=False
        self.session.stream=True
        self.session.headers.update({'Content-Type': 'text/xml'})

        self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
        self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
        self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

        # if pids specified, use that list
        try:
            if len(args) != 0:
                pids = list(args)
                pid_set = [repo.get_object(pid=p,type=Publication) for p in pids]


            else:
                #search for Articles.
                # NOTE(review): ``Article`` is passed positionally here while
                # sibling commands pass ``type=Publication`` -- confirm the
                # intended object type for this branch
                pid_set = repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, Article)

        except Exception as e:
            raise CommandError('Error getting pid list (%s)' % e.message)

        try:
            articles = Paginator(pid_set, 20)
            self.counts['total'] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0,"Error getting page: %s : %s " % (p, e.message))
                self.counts['errors'] +=1
                continue
            for article in objs:
                try:
                    # The following duplicate-detection logic (existence,
                    # title, published state, PMC match, fuzzy title match)
                    # is intentionally disabled; kept for reference.
                    # if not article.exists:
                    #     self.output(1, "Skipping %s because pid does not exist" % article.pid)
                    #     self.counts['skipped'] +=1
                    #     continue
                    # title = article.descMetadata.content.title_info.title if (article.descMetadata.content.title_info and article.descMetadata.content.title_info.title) else None
                    # if title is None or title == '':
                    #     self.output(1, "Skipping %s because OE Title does not exist" % (article.pid))
                    #     self.counts['skipped'] +=1
                    #     continue

                    # if not article.is_published:
                    #     self.output(1, "Skipping %s because pid is not published" % article.pid)
                    #     self.counts['skipped'] +=1
                    #     continue

                    # # try to detect article by PMC
                    # if article.pmcid and not options['force']:
                    #     response = self.session.get(self.pub_query_url, params = {'query' : 'external-identifiers.pmc="PMC%s"' % article.pmcid, 'detail': 'full'})
                    #     entries = load_xmlobject_from_string(response.raw.read(), OESympImportPublication).entries
                    #     self.output(2, "Query for PMC Match: GET %s %s" % (response.url, response.status_code))
                    #     if response.status_code == 200:
                    #         if len(entries) >= 1:
                    #             self.output(1, "Skipping %s because PMC PMC%s already exists" % (article.pid, article.pmcid))
                    #             self.counts['skipped'] +=1

                    #             if options['rel']:
                    #                 symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                    #                 self.process_relations(entries[0].source_id, relations, options)
                    #                 sleep(1)
                    #             continue
                    #     else:
                    #         self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                    #         self.counts['skipped'] +=1
                    #         continue

                    # # try to detect article by Title if it does not have PMC
                    # if not options['force']:
                    #     response = self.session.get(self.pub_query_url, params = {'query' : 'title~"%s"' % title, 'detail': 'full'})
                    #     entries = load_xmlobject_from_string(response.raw.read(), OESympImportPublication).entries
                    #     # Accouont for mutiple results
                    #     titles = [e.title for e in entries]
                    #     self.output(2, "Query for Title Match: GET %s %s" % (response.url, response.status_code))
                    #     if response.status_code == 200:
                    #         found = False
                    #         for t in titles:
                    #             success, percent = percent_match(title, t, 90)
                    #             self.output(1, "Percent Title Match '%s' '%s' %s " % (title, t, percent))
                    #             if success:
                    #                 found = True
                    #         if found:
                    #             self.output(1, "Skipping %s because Title \"%s\" already exists" % (article.pid, title))
                    #             self.counts['skipped'] +=1

                    #             # update relations if rel is set
                    #             if options['rel']:
                    #                 symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                    #                 self.process_relations(entries[0].source_id, relations, options)
                    #                 sleep(1)
                    #             continue
                    #     else:
                    #         self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                    #         self.counts['skipped'] +=1
                    #         continue

                    # Process article and relations
                    symp_pub, relations = article.as_symp()
                    self.process_article(article.pid, symp_pub, options)
                    self.process_relations(article.pid, relations, options)
                    # throttle calls against the remote Elements API
                    sleep(1)

                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    import traceback
                    traceback.print_exc()
                    self.counts['errors'] +=1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
        self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
        self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
Exemplo n.º 4
0
    def handle(self, *args, **options):
        """Export all article-content-model Publications to publications_csv.csv.

        Writes one CSV row per object: pid, title, withdrawal state, authors,
        journal/publisher, version, final-version link/DOI, subjects, funders,
        license, copyright, admin note, review/rights dates, PMC, pubs id, and
        whether a PDF datastream is deposited.
        """
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1


        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # BUG FIX: keep a reference to the file object -- csv.writer has no
        # close() method, so the original ``writer.close()`` at the end raised
        # AttributeError and the file was never closed.
        csv_file = open("publications_csv.csv", 'wb')
        writer = csv.writer(csv_file)
        writer.writerow([
            smart_str(u"PID"),
            smart_str(u"Title"),
            smart_str(u"Withdrawn"),
            smart_str(u"Authors"),
            smart_str(u"Journal Title"),
            smart_str(u"Publisher"),
            smart_str(u"Version"),
            smart_str(u"Final Published Link"),
            smart_str(u"DOI"),
            smart_str(u"Subjects"),
            smart_str(u"Funding Group"),
            smart_str(u"CC License"),
            smart_str(u"Copyright Statement"),
            smart_str(u"Admin Note"),
            smart_str(u"Date Reviewed"),
            smart_str(u"Rights Research Date"),
            smart_str(u"PMC"),
            smart_str(u"PUBSID"),
            smart_str(u"File Deposited"),
        ])

        try:
            articles = Paginator(pid_set, 100)

        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        # process all Articles, one paginator page at a time
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    mods = article.descMetadata.content
                    symp = article.sympAtom.content
                    authors = []
                    subjects = []
                    funders = []
                    for author in mods.authors:
                        authors.append('%s %s' % (author.given_name, author.family_name))
                    for subject in mods.subjects:
                        subjects.append(subject.topic)
                    for funder in mods.funders:
                        funders.append(funder.name)

                    writer.writerow([
                        smart_str(article.pid if article.pid else ''),
                        smart_str(article.label if article.label else ''),
                        smart_str(article.is_withdrawn),
                        smart_str(",".join(authors)),
                        smart_str(mods.journal.title if mods.journal else ''),
                        smart_str(mods.journal.publisher if mods.journal else ''),
                        smart_str(mods.version if mods.version else ''),
                        smart_str(mods.final_version.url if mods.final_version else ''),
                        smart_str(mods.final_version.doi if mods.final_version else ''),
                        smart_str(",".join(subjects)),
                        smart_str(",".join(funders)),
                        smart_str(mods.license.text if mods.license else ''),
                        smart_str(mods.copyright.text if mods.copyright else ''),
                        smart_str(mods.admin_note.text if mods.admin_note else ''),
                        smart_str(article.provenance.content.date_reviewed if article.provenance else ''),
                        smart_str(mods.rights_research_date if mods.rights_research_date else ''),
                        smart_str(article.pmcid if article.pmcid else ''),
                        smart_str(symp.pubs_id if symp else ''),
                        smart_str("Yes" if article.pdf.exists else 'No'),
                    ])

                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    # self.counts['errors'] +=1
        # close the underlying file handle (was writer.close(), a bug)
        csv_file.close()
Exemplo n.º 5
0
class Command(BaseCommand):
    '''Export all article-content-model Publications to publications_csv.csv.

    NOTE(review): an earlier docstring claimed this command checks journal
    titles/publishers against Sherpa Romeo, but the visible code only dumps
    publication metadata to CSV -- confirm intended purpose.
    '''
    args = "[netid netid ...]"
    help = __doc__

    # NOTE(review): the --noact flag is declared but never read in handle();
    # its help text looks copy-pasted from another command -- confirm.
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
        )

    def handle(self, *args, **options):
        """Write one CSV row per article-content-model Publication."""
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1


        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # BUG FIX: keep a reference to the file object -- csv.writer has no
        # close() method, so the original ``writer.close()`` at the end raised
        # AttributeError and the file was never closed.
        csv_file = open("publications_csv.csv", 'wb')
        writer = csv.writer(csv_file)
        writer.writerow([
            smart_str(u"PID"),
            smart_str(u"Title"),
            smart_str(u"Withdrawn"),
            smart_str(u"Authors"),
            smart_str(u"Journal Title"),
            smart_str(u"Publisher"),
            smart_str(u"Version"),
            smart_str(u"Final Published Link"),
            smart_str(u"DOI"),
            smart_str(u"Subjects"),
            smart_str(u"Funding Group"),
            smart_str(u"CC License"),
            smart_str(u"Copyright Statement"),
            smart_str(u"Admin Note"),
            smart_str(u"Date Reviewed"),
            smart_str(u"Rights Research Date"),
            smart_str(u"PMC"),
            smart_str(u"PUBSID"),
            smart_str(u"File Deposited"),
        ])

        try:
            articles = Paginator(pid_set, 100)

        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        # process all Articles, one paginator page at a time
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    mods = article.descMetadata.content
                    symp = article.sympAtom.content
                    authors = []
                    subjects = []
                    funders = []
                    for author in mods.authors:
                        authors.append('%s %s' % (author.given_name, author.family_name))
                    for subject in mods.subjects:
                        subjects.append(subject.topic)
                    for funder in mods.funders:
                        funders.append(funder.name)

                    writer.writerow([
                        smart_str(article.pid if article.pid else ''),
                        smart_str(article.label if article.label else ''),
                        smart_str(article.is_withdrawn),
                        smart_str(",".join(authors)),
                        smart_str(mods.journal.title if mods.journal else ''),
                        smart_str(mods.journal.publisher if mods.journal else ''),
                        smart_str(mods.version if mods.version else ''),
                        smart_str(mods.final_version.url if mods.final_version else ''),
                        smart_str(mods.final_version.doi if mods.final_version else ''),
                        smart_str(",".join(subjects)),
                        smart_str(",".join(funders)),
                        smart_str(mods.license.text if mods.license else ''),
                        smart_str(mods.copyright.text if mods.copyright else ''),
                        smart_str(mods.admin_note.text if mods.admin_note else ''),
                        smart_str(article.provenance.content.date_reviewed if article.provenance else ''),
                        smart_str(mods.rights_research_date if mods.rights_research_date else ''),
                        smart_str(article.pmcid if article.pmcid else ''),
                        smart_str(symp.pubs_id if symp else ''),
                        smart_str("Yes" if article.pdf.exists else 'No'),
                    ])

                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    # self.counts['errors'] +=1
        # close the underlying file handle (was writer.close(), a bug)
        csv_file.close()


    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
Exemplo n.º 6
0
    def handle(self, *args, **options):
        """Create a new pubs:<id> shell object pointing at an existing Publication.

        Expects two positional args: the pid of the existing Publication and
        the numeric pubs id for the new object.  Ingests a minimal FOXML
        shell, points its hasCurrent/hasVisible relations at the original
        object, re-sends the original to Symplectic, and saves unless the
        ``noact`` option is set.

        NOTE(review): the original source of this block had several syntax
        and name errors (unparenthesized multi-line string, missing except
        clause, undefined ``ds``/``repo``/``original_obj``); the fixes below
        follow the sibling commands in this file -- confirm against intent.
        """
        self.options = options
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        self.counts = defaultdict(int)

        # duplicates list
        self.duplicates = {}

        # set the name of the report of duplications
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "replaces-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        # get last run time and report it in both EST and UTC
        time_zone = pytz.timezone('US/Eastern')

        last_run = LastRun.objects.get(name='Convert Symp to OE')
        date = last_run.start_time

        self.output(1, '%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
        date = time_zone.localize(date)
        date = date.astimezone(pytz.utc)
        date_str = date.strftime("%Y-%m-%dT%H:%M:%S")
        self.output(1, '%s UTC' % date_str)

        try:
            # if pids specified, use that list
            if len(args) != 0:
                pids = list(args)
            else:
                raise Exception("no pids specified")
        except Exception as e:
            raise Exception("Error getting pids: %s" % e.message)

        self.counts['total'] = len(pids)

        try:
            self.output(1, "\nProcessing %s" % pids[0])
            # Load first as a generic object because that is the most likely type
            obj = self.repo.get_object(pid=pids[0])
            if not obj.exists:
                self.output(1, "Skipping because %s does not exist" % pids[0])
                # BUG FIX: original raised with undefined ``e.message`` here
                raise Exception("pid %s does not exist" % pids[0])

            # BUG FIX: ``ds`` was used below without ever being assigned;
            # fetch the SYMPLECTIC-ATOM datastream as the sibling commands do
            ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')

            # choose content type
            content_types = {'Article': 'journal article'}

            obj_types = ds.content.node.xpath('atom:category/@label', namespaces={'atom': 'http://www.w3.org/2005/Atom'})
            if obj_types[1] in content_types.values():
                logging.info("Processing %s as Publication" % pids[0])
                obj = self.repo.get_object(pid=pids[0], type=Publication)
            else:
                logging.info("Skipping %s Invalid Content Type" % pids[0])
                # BUG FIX: original raised with undefined ``e.message`` here
                raise Exception("invalid content type for %s" % pids[0])

            # Build the pubs object id from the second positional argument
            # pubs_id = obj.sympAtom.content.serialize().split('<pubs:id>')[1].split('</pubs:id>')[0]
            pubs_id = "pubs:%s" % (pids[1])
            self.output(1, "Pub ID: %s" % pubs_id)

            # ingesting new pubs_id object
            # BUG FIX: the adjacent string literals on separate lines were a
            # syntax error without parentheses; also inserted the spaces
            # between XML attributes that implicit concatenation dropped
            foxml = (
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<foxml:digitalObject VERSION="1.1" PID="' + pubs_id + '" '
                'xmlns:foxml="info:fedora/fedora-system:def/foxml#" '
                'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                'xsi:schemaLocation="info:fedora/fedora-system:def/foxml# http://www.fedora.info/definitions/1/0/foxml1-1.xsd">'
                '<foxml:objectProperties>'
                '<foxml:property NAME="info:fedora/fedora-system:def/model#state" VALUE="Active"/>'
                '</foxml:objectProperties>'
                '</foxml:digitalObject>'
            )
            pubs_obj = self.repo.ingest(text=foxml)
            # BUG FIX: ``repo`` was undefined; use self.repo
            obj = self.repo.get_object(pid=pubs_id)
            # BUG FIX: extend() with a string adds it character by character;
            # append the identifier as a single entry
            obj.dc.content.identifier_list.append(pubs_id)
            # the original Publication object (despite the pid-like name in
            # the original source, this is a full object)
            original_pub = self.repo.get_object(pid=pids[0], type=Publication)
            # pubs_dc = '<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"><dc:identifier>'+ pubs_id +'</dc:identifier></oai_dc:dc>'
            # pubs_obj.dc.content = pubs_dc

            # Update pubs object to point hasCurrent and hasVisible attibutes
            # to the original object
            sympns = Namespace('info:symplectic/symplectic-elements:def/model#')
            pubs_obj.rels_ext.content.bind('symp', sympns)
            has_current = (URIRef("info:fedora/" + obj.pid),
                           URIRef('info:symplectic/symplectic-elements:def/model#hasCurrent'),
                           URIRef(original_pub))
            has_visible = (URIRef("info:fedora/" + pubs_id),
                           URIRef('info:symplectic/symplectic-elements:def/model#hasVisible'),
                           URIRef(original_pub))
            # hasCurrent
            obj.rels_ext.content.set(has_current)

            # hasVisible
            obj.rels_ext.content.set(has_visible)

            # Close pubs rels_ext object
            obj.rels_ext.content.close()

            # re-send the original publication to Symplectic
            symp_pub, relations = original_pub.as_symp()
            self.process_article(original_pub.pid, symp_pub, options)
            self.process_relations(original_pub.pid, relations, options)

            # SAVE OBJECTS UNLESS NOACT OPTION
            if not options['noact']:
                # BUG FIX: ``original_obj`` was undefined; save the original
                # publication object created above
                original_pub.save()
                pubs_obj.save()
                self.counts['saved'] += 1
        except Exception as e:
            # BUG FIX: the try block above had no matching except clause
            # (a syntax error in the original)
            self.output(1, "Error processing %s: %s" % (pids[0], e))
            self.counts['errors'] += 1
    def handle(self, *args, **options):
        '''Merge a duplicate Elements publication into the original object.

        Expects exactly two pids as positional args: the new Elements pid
        first, then the old (original) pid.  Metadata, content and
        statistics are moved onto the Elements object and the original is
        inactivated.  With --noact, nothing is saved.
        '''
        self.options = options
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        year = date.today().year
        quarter = year_quarter(date.today().month)  # get the quarter 1, 2, 3, 4

        # counters
        self.counts = defaultdict(int)
        # referenced by the KeyboardInterrupt report below; must exist up front
        self.duplicates = {}

        # set the name of the report of duplications
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "merge-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        # exactly two pids are required for a merge
        if len(args) != 2:
            raise Exception("specify two pid")
        pids = list(args)

        self.counts['total'] = len(pids)

        element_obj = None      # the surviving Elements object
        original_obj = None     # the old object being merged away
        original_stats = []

        for idx, pid in enumerate(pids):
            try:
                if idx == 0:
                    self.output(1, "\nProcessing  Elements PID %s" % pid)
                    # Load first as Article becauce that is the most likely type
                    element_obj = self.repo.get_object(pid=pid, type=Publication)
                    if not element_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        element_obj = None
                        break
                    # only discard old stats once the object is known to exist
                    element_stats = ArticleStatistics.objects.filter(pid=pid)
                    if element_stats:
                        element_stats.delete()
                elif idx == 1:
                    self.output(1, "\nProcessing  Old PID %s" % pid)
                    original_obj = self.repo.get_object(pid=pid, type=Publication)
                    if not original_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        original_obj = None
                        break
                    original_stats = ArticleStatistics.objects.filter(pid=pid)
                    if not original_stats:
                        # guarantee at least one stats row for the current quarter;
                        # re-query so original_stats stays an iterable queryset
                        ArticleStatistics.objects.create(pid=pid, year=year, quarter=quarter)
                        original_stats = ArticleStatistics.objects.filter(pid=pid)

            except (KeyboardInterrupt, SystemExit):
                if self.counts['saved'] > 0:
                    self.write_report(self.duplicates, error="interrupt")
                raise

            except Exception as e:
                self.output(1, "Error processing %s: %s" % (pid, e.message))
                self.counts['errors'] += 1
                return

        if element_obj is None or original_obj is None:
            # one of the two objects is missing; merging is impossible
            self.counts['skipped'] += 1
        else:
            # copy metadata and content from the original onto the Elements object
            element_obj.descMetadata.content = original_obj.descMetadata.content
            element_obj.provenance.content = original_obj.provenance.content
            element_obj.dc.content = original_obj.dc.content
            if original_obj.pdf.content:
                element_obj.pdf.content = original_obj.pdf.content
            original_obj.state = 'I'  # inactivate the merged-away original
            element_obj.provenance.content.init_object(element_obj.pid, 'pid')
            element_obj.provenance.content.merged(original_obj.pid, element_obj.pid)

            # move download/view statistics over to the surviving pid
            ArticleStatistics.objects.filter(pid=element_obj.pid).delete()
            for stat in original_stats:
                ArticleStatistics.objects.create(pid=element_obj.pid, year=stat.year, quarter=stat.quarter, num_downloads=stat.num_downloads, num_views=stat.num_views)

            coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])
            element_obj.collection = coll
            element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel, URIRef(Publication.ARTICLE_CONTENT_MODEL)))
            element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel, URIRef(Publication.PUBLICATION_CONTENT_MODEL)))

            # SAVE OBJECTS UNLESS NOACT OPTION
            if not options['noact']:
                element_obj.save()
                original_obj.save()
                self.counts['saved'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Converted: %s\n" % self.counts['saved'])
class Command(BaseCommand):
    '''Provides merge/ignore options for duplicate objects created by Elements connector for manual duplicate management.
        This alters the pubs_object that the original and duplicate share.
    '''
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Reports the pid and total number of object that would be processed but does not really do anything.'),
        make_option('--ignore', '-i',
                    action='store_true',
                    default=False,
                    help='Changes the pub object to disregard the duplicate pids.'),
        make_option('--merge', '-m',
                    action='store_true',
                    default=False,
                    help='Keeps the changes from the duplicate pids by copying ATOM-FEED to original.'),
        )

    def handle(self, *args, **options):
        '''Merge a duplicate Elements publication into the original object.

        Expects exactly two pids as positional args: the new Elements pid
        first, then the old (original) pid.  Metadata, content and
        statistics are moved onto the Elements object and the original is
        inactivated.  With --noact, nothing is saved.
        '''
        self.options = options
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        year = date.today().year
        quarter = year_quarter(date.today().month)  # get the quarter 1, 2, 3, 4

        # counters
        self.counts = defaultdict(int)
        # referenced by the KeyboardInterrupt report below; must exist up front
        self.duplicates = {}

        # set the name of the report of duplications
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "merge-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        # exactly two pids are required for a merge
        if len(args) != 2:
            raise Exception("specify two pid")
        pids = list(args)

        self.counts['total'] = len(pids)

        element_obj = None      # the surviving Elements object
        original_obj = None     # the old object being merged away
        original_stats = []

        for idx, pid in enumerate(pids):
            try:
                if idx == 0:
                    self.output(1, "\nProcessing  Elements PID %s" % pid)
                    # Load first as Article becauce that is the most likely type
                    element_obj = self.repo.get_object(pid=pid, type=Publication)
                    if not element_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        element_obj = None
                        break
                    # only discard old stats once the object is known to exist
                    element_stats = ArticleStatistics.objects.filter(pid=pid)
                    if element_stats:
                        element_stats.delete()
                elif idx == 1:
                    self.output(1, "\nProcessing  Old PID %s" % pid)
                    original_obj = self.repo.get_object(pid=pid, type=Publication)
                    if not original_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        original_obj = None
                        break
                    original_stats = ArticleStatistics.objects.filter(pid=pid)
                    if not original_stats:
                        # guarantee at least one stats row for the current quarter;
                        # re-query so original_stats stays an iterable queryset
                        ArticleStatistics.objects.create(pid=pid, year=year, quarter=quarter)
                        original_stats = ArticleStatistics.objects.filter(pid=pid)

            except (KeyboardInterrupt, SystemExit):
                if self.counts['saved'] > 0:
                    self.write_report(self.duplicates, error="interrupt")
                raise

            except Exception as e:
                self.output(1, "Error processing %s: %s" % (pid, e.message))
                self.counts['errors'] += 1
                return

        if element_obj is None or original_obj is None:
            # one of the two objects is missing; merging is impossible
            self.counts['skipped'] += 1
        else:
            # copy metadata and content from the original onto the Elements object
            element_obj.descMetadata.content = original_obj.descMetadata.content
            element_obj.provenance.content = original_obj.provenance.content
            element_obj.dc.content = original_obj.dc.content
            if original_obj.pdf.content:
                element_obj.pdf.content = original_obj.pdf.content
            original_obj.state = 'I'  # inactivate the merged-away original
            element_obj.provenance.content.init_object(element_obj.pid, 'pid')
            element_obj.provenance.content.merged(original_obj.pid, element_obj.pid)

            # move download/view statistics over to the surviving pid
            ArticleStatistics.objects.filter(pid=element_obj.pid).delete()
            for stat in original_stats:
                ArticleStatistics.objects.create(pid=element_obj.pid, year=stat.year, quarter=stat.quarter, num_downloads=stat.num_downloads, num_views=stat.num_views)

            coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])
            element_obj.collection = coll
            element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel, URIRef(Publication.ARTICLE_CONTENT_MODEL)))
            element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel, URIRef(Publication.PUBLICATION_CONTENT_MODEL)))

            # SAVE OBJECTS UNLESS NOACT OPTION
            if not options['noact']:
                element_obj.save()
                original_obj.save()
                self.counts['saved'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Converted: %s\n" % self.counts['saved'])

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
Exemplo n.º 9
0
class Command(BaseCommand):
    ''' This command run through all the articles and makes sure that journal titles and publishers match against Sherpa Romeo
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Fixed all caps title in articles'),
        )

    def handle(self, *args, **options):

        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)


        try:
            articles = Paginator(pid_set, 100)

        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0,"Error getting page: %s : %s " % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    else:
                        mods = article.descMetadata.content
                        if mods.journal is not None:
                            if mods.journal.title is not None:
                                try:
                                    journals = romeo.search_journal_title(mods.journal.title, type='starts') if mods.journal.title else []
                                    suggestions = [journal_suggestion_data(journal) for journal in journals]
                                    if mods.journal.title.lower() in map(str.lower, JOURNAL_LIST):
                                        mods.journal.title = suggestions[0]['value']
                                        print "JOURNAL"
                                        print mods.journal.title
                                        article.save()
                                    else:
                                        continue

                                except:
                                    suggestions = []

                            # if mods.journal.publisher is not None:
                            #     try:
                            #         publishers = romeo.search_publisher_name(mods.journal.publisher, versions='all')
                            #         suggestions = [publisher_suggestion_data(pub) for pub in publishers]
                            #         mods.journal.publisher = suggestions[0]['value']
                            #         print "PUBLISHER"
                            #         print mods.journal.publisher
                            #     except:
                            #         suggestions = []

                        else:
                            continue


                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                    # self.counts['errors'] +=1


    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
Exemplo n.º 10
0
    def handle(self, *args, **options):

        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)


        try:
            articles = Paginator(pid_set, 100)

        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        #process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                #print error and go to next iteration of loop
                self.output(0,"Error getting page: %s : %s " % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    else:
                        mods = article.descMetadata.content
                        if mods.journal is not None:
                            if mods.journal.title is not None:
                                try:
                                    journals = romeo.search_journal_title(mods.journal.title, type='starts') if mods.journal.title else []
                                    suggestions = [journal_suggestion_data(journal) for journal in journals]
                                    if mods.journal.title.lower() in map(str.lower, JOURNAL_LIST):
                                        mods.journal.title = suggestions[0]['value']
                                        print "JOURNAL"
                                        print mods.journal.title
                                        article.save()
                                    else:
                                        continue

                                except:
                                    suggestions = []

                            # if mods.journal.publisher is not None:
                            #     try:
                            #         publishers = romeo.search_publisher_name(mods.journal.publisher, versions='all')
                            #         suggestions = [publisher_suggestion_data(pub) for pub in publishers]
                            #         mods.journal.publisher = suggestions[0]['value']
                            #         print "PUBLISHER"
                            #         print mods.journal.publisher
                            #     except:
                            #         suggestions = []

                        else:
                            continue


                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
    def handle(self, *args, **options):
        '''Convert Symplectic Elements harvested objects into OpenEmory
        publication objects.

        Candidate pids come either from the command line or from a resource
        index query for objects whose SYMPLECTIC-ATOM datastream was created
        since the last recorded run (or since ``--date``).  Each object is
        converted via ``from_symp()``; duplicates (rels-ext
        ``dcterms:replaces``) are withdrawn, and the most recently modified
        attachment of an allowed mime type is copied into a ``content``
        datastream.  With ``--noact`` nothing is saved.
        '''
        self.options = options
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters
        self.counts = defaultdict(int)

        # duplicates list
        self.duplicates = {}


        # error list
        self.errors = {}

        # set the name of the report of duplications
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "duplicates-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        # get last run time and set new one
        # (cutoff defaults to the stored LastRun time; --date overrides it)
        time_zone = pytz.timezone('US/Eastern')
        if not options['date']:
            last_run = LastRun.objects.get(name='Convert Symp to OE')
            date = last_run.start_time
        else:
           try:
               date = datetime.strptime(options['date'], '%Y-%m-%dT%H:%M:%S')
           except:
               raise CommandError("Could not parse date")
        if options['date'] and len(args) !=0:
            raise CommandError('Can not use date option with list of pids')

        # only advance the recorded run time on a real full run
        # (no explicit date, no pid list, not --noact, not --force)
        if (not options['date']) and  (len(args) == 0) and (not options['noact']) and (not options['force']):
            last_run.start_time = datetime.now()
            last_run.save()

        # normalize the cutoff (assumed US/Eastern) to UTC for comparison
        # against datastream timestamps
        logging.info('%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
        date = time_zone.localize(date)
        date = date.astimezone(pytz.utc)
        date_str = date.strftime("%Y-%m-%dT%H:%M:%S")

        logging.info('%s UTC' % date_str)
        try:
            #if pids specified, use that list
            if len(args) != 0:
                pids = list(args)
            else:
                # otherwise query the resource index for objects with a
                # SYMPLECTIC-ATOM datastream created since the cutoff
                query = """SELECT ?pid
                        WHERE {
                            ?pid <info:fedora/fedora-system:def/view#disseminates> ?ds.
                             ?pid <info:fedora/fedora-system:def/model#createdDate> ?created.
                        FILTER (
                             regex(str(?ds), 'SYMPLECTIC-ATOM') &&
                             ?created >= xsd:dateTime('%sZ')
                        )
                        }""" % date_str
                pids = [o['pid'] for o in self.repo.risearch.sparql_query(query)]
        except Exception as e:
            raise Exception("Error getting pids: %s" % e.message)

        self.counts['total'] = len(pids)

        for pid in pids:
            try:
                logging.info("Processing %s" % pid)
                # Load first as Publication becauce that is the most likely type
                obj = self.repo.get_object(pid=pid)
                if not obj.exists:
                    logging.warning("Skipping because %s does not exist" % pid)
                    continue
                ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')
                if not ds:
                    logging.warning("Skipping %s because SYMPLECTIC-ATOM ds does not exist" % pid)
                    continue
                # skip objects whose ATOM feed is older than the cutoff,
                # unless --force was given
                ds_mod = ds.last_modified().strftime("%Y-%m-%dT%H:%M:%S")
                if date_str and  ds_mod < date_str and (not options['force']):
                    logging.warning("Skipping %s because SYMPLECTIC-ATOM ds not modified since last run %s " % (pid, ds_mod))
                    self.counts['skipped']+=1
                    continue
                # missing licence content: counted as skipped and alerted to
                # slack -- NOTE(review): there is no `continue` here, so the
                # object is still processed; confirm that is intentional
                license = obj.getDatastreamObject('SYMPLECTIC-LICENCE')
                if not license.content:
                    logging.warning("Skipping %s because SYMPLECTIC-LICENCE ds not modified since last run %s " % (pid, ds_mod))
                    self.counts['skipped']+=1
                    payload = {"text": "No Assent Publication.\n pid: %s" % pid}
                    r = requests.post(settings.SLACK_TOKEN, data=json.dumps(payload))


                # WHEN ADDING NEW CONTENT TYPES:
                # 1. Make sure object content modle has from_symp() function
                # 2. Add to  content_types dict
                # 3. Add elif block (see few lines below)
                # 4. Add line in summary section of this script

                #choose content type
                content_types = {'Article': 'journal article', 'Book': 'book', 'Chapter': 'chapter', 'Conference': 'conference', 'Poster': 'poster', 'Report': 'report', 'Presentation': 'presentation'}
                # presumably the second atom:category label carries the
                # Elements content type -- verify against a sample feed
                obj_types = ds.content.node.xpath('atom:category/@label', namespaces={'atom': 'http://www.w3.org/2005/Atom'})
                if obj_types[1] in content_types.values():
                    logging.info("Processing %s as Publication" % pid)
                    obj = self.repo.get_object(pid=pid, type=Publication)
                else:
                    logging.info("Skipping %s Invalid Content Type" % pid)
                    continue


                # populate OE datastreams from the SYMPLECTIC-ATOM feed
                obj.from_symp()

                 # get a list of predicates
                properties = []
                for p in list(obj.rels_ext.content.predicates()):
                  properties.append(str(p))
                # skip if the rels-ext has the "replaces tag, which indicates duplicates"
                replaces_tag = "http://purl.org/dc/terms/replaces"
                if replaces_tag in properties:
                    self.counts['duplicates']+=1
                    # get the pid of the original object this is replaceing
                    replaces_pid = obj.rels_ext.content.serialize().split('<dcterms:replaces rdf:resource="')[1].split('"')[0]
                    # add to duplicate dict
                    self.duplicates[pid.replace('info:fedora/','')] = replaces_pid.replace('info:fedora/','')


                    if not obj.is_withdrawn:

                        # withdraw under the service account, creating a
                        # fallback user if the expected one is missing
                        try:
                            user = User.objects.get(username=u'oebot')

                        except ObjectDoesNotExist:

                            user = User.objects.get_or_create(username=u'bob', password=u'bobspassword',)[0]
                            user.first_name = "Import"
                            user.last_name = "Process"
                            user.save()

                        reason = "Duplicate."
                        self.counts['withdrawn']+=1
                        obj.provenance.content.init_object(obj.pid, 'pid')
                        obj.provenance.content.withdrawn(user,reason)
                        obj.state = 'I'
                        logging.info("Withdrew duplicate pid: %s" % obj.pid)



                else:
                    self.counts['pdf']+=1


                # convert attached PDF fle to be used with OE
                # filter datastreams for only application/pdf
                mime = None
                mime_ds_list = None
                print obj.descMetadata.content.genre

                # each genre has its own whitelist of attachment mime types
                if obj.descMetadata.content.genre == "Article" or obj.descMetadata.content.genre == "Book" or obj.descMetadata.content.genre == "Chapter":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_types.values()]
                elif obj.descMetadata.content.genre == "Conference":

                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_conference.values()]
                elif obj.descMetadata.content.genre == "Report":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_report.values()]

                elif obj.descMetadata.content.genre == "Poster":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_poster.values()]

                elif obj.descMetadata.content.genre == "Presentation":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_presentation.values()]

                else:
                     logging.info("Skipping because mime type is not allowed")
                     continue


                if mime_ds_list:
                    # sort by DS timestamp does not work yet asks for global name obj because of lambda function
                    new_dict = {}
                    for mime in mime_ds_list:
                        new_dict[mime] = obj.getDatastreamObject(mime).last_modified()

                    sorted_mimes = sorted(new_dict.items(), key=lambda x: x[1])

                    # sorted_mimes = sorted(mime_ds_list, key=lambda p: str(obj.getDatastreamObject(p).last_modified()))
                    mime = sorted_mimes[-1][0]  # most recent

                if not options['noact']:
                    obj.save()
                    # obj.index_data()

                    # copy the chosen attachment into the 'content' datastream
                    if mime:
                        mime_type =  obj.ds_list[mime].mimeType
                        print mime_type
                        print "####################################"
                        self.repo.api.addDatastream(pid=obj.pid, dsID='content', dsLabel='%s' % mime_type,
                                                mimeType=mime_type, logMessage='added %s content from %s' % (mime_type,mime),
                                                controlGroup='M', versionable=True, content=obj.getDatastreamObject(mime).content)
                        logging.info("Converting %s to %s Content" % (mime,mime_type))
                        self.counts[mime_type]+=1
                        self.counts['Publication']+=1




            except (KeyboardInterrupt, SystemExit):
                if self.counts['duplicates'] > 0:
                  self.write_dup_report(self.duplicates, error="interrupt")
                raise

            except Exception as e:
                logging.error("Error processing %s: %s" % (pid, e.message))
                logging.error(obj.rels_ext.content.serialize(pretty=True))
                self.counts['errors']+=1
                self.errors[pid] = e.message

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Duplicates: %s\n" % self.counts['duplicates'])
        self.stdout.write("Withdrew: %s\n" % self.counts['withdrawn'])
        self.stdout.write("PDFs converted: %s\n" % self.counts['pdf'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Publications converted: %s\n" % self.counts['Publication'])

        if self.counts['duplicates'] > 0 or self.counts['errors'] > 0:
          self.write_dup_report(self.duplicates, self.errors)
Exemplo n.º 12
0
class Command(BaseCommand):
    ''' This command run through all the articles and makes sure that journal titles and publishers match against Sherpa Romeo
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--article', '-a',
                    action='store_true',
                    default=False,
                    help='Cleans up content models for articles.'),
        make_option('--book', '-b',
                    action='store',
                    default=False,
                    help='Cleans up content models for books.'),
        make_option('--force', '-f',
                    action='store_true',
                    default=False,
                    help='Updates even if SYMPLECTIC-ATOM has not been modified since last run.'),
        )

    def handle(self, *args, **options):
        '''Ensure every article or book object also carries the generic
        publication content model, adding the relationship where missing.

        Exactly one of --article / --book must be given to select which
        content model's objects to sweep.
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        if options['article'] and options['book']:
            raise CommandError('Can not use both parameters')

        if not options['article'] and not options['book']:
            raise CommandError('Use at least one parameter')

        if options['article']:
            cmodel = Publication.ARTICLE_CONTENT_MODEL

        if options['book']:
            cmodel = Publication.BOOK_CONTENT_MODEL

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

        try:
            publications = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            return  # cannot iterate without a paginator

        # process all publications page by page
        for p in publications.page_range:
            try:
                objs = publications.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                continue
            for publication in objs:
                try:
                    if not publication.exists:
                        self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                        continue
                    # add the generic publication content model if missing
                    if not publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                        publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                        publication.save()
                except Exception as e:
                    # loop variable is `publication` (original said article.pid: NameError)
                    self.output(0, "Error processing pid: %s : %s " % (publication.pid, e.message))