Python solr_interface示例，openemory.util.solr_interface Python示例

示例#1

0

显示文件

def articles_by_tag(user, tag):
    '''Build a Solr query for the articles a user has bookmarked with a tag.

    The pids come from :meth:`pids_by_tag`, called with the given
    :class:`~django.contrib.auth.models.User` and tag (see
    :class:`~openemory.accounts.models.Bookmark`); Solr is then queried
    for display information about those objects.

    :param user: user whose bookmarks should be searched
    :param tag: tag passed through to :meth:`pids_by_tag`
    :returns: an empty list when no pids are found; otherwise a Solr
        query restricted to ``ARTICLE_VIEW_FIELDS`` and sorted by
        ``-last_modified``, returned without calling execute so the
        caller can paginate it
    '''
    solr = solr_interface()
    bookmarked = pids_by_tag(user, tag)
    if not bookmarked:
        # nothing tagged, so there is no query to build
        return []

    # OR one term per pid together:
    #   Q(pid=pid1) | Q(pid=pid2) | Q(pid=pid3)
    any_pid = None
    for pid in bookmarked:
        term = solr.Q(pid=pid)
        if any_pid is not None:
            any_pid |= term
        else:
            any_pid = term

    matches = solr.query(any_pid)
    limited = matches.field_limit(ARTICLE_VIEW_FIELDS)
    # hand back the query itself instead of executing it, so that the
    # result can be paginated
    return limited.sort_by('-last_modified')  # best option ?

示例#2

0

显示文件

文件： context_processors.py 项目： emory-libraries/OpenEmory

def statistics(request):
    '''`Template context processor
    <https://docs.djangoproject.com/en/dev/ref/settings/#template-context-processors>`_
    that exposes publication statistics as ``ARTICLE_STATISTICS``.

    The value is a dict with five keys: ``total_articles`` (Solr match
    count for state ``'A'`` records with the article content model),
    ``total_views`` and ``total_downloads`` (sums over every
    ``ArticleStatistics`` row), and ``year_views`` and
    ``year_downloads`` (the same sums restricted to the current year).

    :param request: the current request (not used)
    :returns: dict with the single key ``ARTICLE_STATISTICS``
    '''
    active_articles = solr_interface().query().filter(
        content_model=Publication.ARTICLE_CONTENT_MODEL, state='A')
    # rows=0 because only numFound is read from the response
    found = active_articles.paginate(rows=0).execute().result.numFound

    all_time = ArticleStatistics.objects.all().aggregate(
        total_views=Sum('num_views'),
        total_downloads=Sum('num_downloads'))

    this_year = ArticleStatistics.objects \
        .filter(year=date.today().year) \
        .aggregate(year_views=Sum('num_views'),
                   year_downloads=Sum('num_downloads'))

    stats = {'total_articles': found}
    stats.update(all_time)
    stats.update(this_year)
    return {'ARTICLE_STATISTICS': stats}

示例#3

0

显示文件

文件： models.py 项目： emory-libraries/OpenEmory

def articles_by_tag(user, tag):
    """Build a Solr query for the articles a user has bookmarked with a tag.

    The pids come from :meth:`pids_by_tag`, called with the given
    :class:`~django.contrib.auth.models.User` and tag (see
    :class:`~openemory.accounts.models.Bookmark`); Solr is then queried
    for display information about those objects.

    :param user: user whose bookmarks should be searched
    :param tag: tag passed through to :meth:`pids_by_tag`
    :returns: an empty list when no pids are found; otherwise a Solr
        query restricted to ``PUBLICATION_VIEW_FIELDS`` and sorted by
        ``-last_modified``, returned without calling execute so the
        caller can paginate it
    """
    solr = solr_interface()
    bookmarked = pids_by_tag(user, tag)
    if not bookmarked:
        # nothing tagged, so there is no query to build
        return []

    # OR one term per pid together:
    #   Q(pid=pid1) | Q(pid=pid2) | Q(pid=pid3)
    any_pid = None
    for pid in bookmarked:
        term = solr.Q(pid=pid)
        if any_pid is not None:
            any_pid |= term
        else:
            any_pid = term

    matches = solr.query(any_pid)
    limited = matches.field_limit(PUBLICATION_VIEW_FIELDS)
    # hand back the query itself instead of executing it, so that the
    # result can be paginated
    return limited.sort_by("-last_modified")  # best option ?

示例#4

0

显示文件

文件： context_processors.py 项目： mprefer/OpenEmory

def statistics(request):
    '''`Template context processor
    <https://docs.djangoproject.com/en/dev/ref/settings/#template-context-processors>`_
    that exposes publication statistics as ``ARTICLE_STATISTICS``.

    The value is a dict with five keys: ``total_articles`` (Solr match
    count for state ``'A'`` records with the article content model),
    ``total_views`` and ``total_downloads`` (sums over every
    ``ArticleStatistics`` row), and ``year_views`` and
    ``year_downloads`` (the same sums restricted to the current year).

    :param request: the current request (not used)
    :returns: dict with the single key ``ARTICLE_STATISTICS``
    '''
    active_articles = solr_interface().query().filter(
        content_model=Article.ARTICLE_CONTENT_MODEL, state='A')
    # rows=0 because only numFound is read from the response
    found = active_articles.paginate(rows=0).execute().result.numFound

    all_time = ArticleStatistics.objects.all().aggregate(
        total_views=Sum('num_views'),
        total_downloads=Sum('num_downloads'))

    this_year = ArticleStatistics.objects \
        .filter(year=date.today().year) \
        .aggregate(year_views=Sum('num_views'),
                   year_downloads=Sum('num_downloads'))

    stats = {'total_articles': found}
    stats.update(all_time)
    stats.update(this_year)
    return {'ARTICLE_STATISTICS': stats}

示例#5

0

显示文件

文件： models.py 项目： mprefer/OpenEmory

    def _find_articles(self):
        '''Build the base Solr query for articles owned by this user.

        The query matches on ``owner`` (``self.user.username``), is
        filtered by the article content model, and is limited to
        ``ARTICLE_VIEW_FIELDS``.

        Internal method with common functionality for
        :meth:`recent_articles` and :meth:`unpublished_articles`.

        :returns: the Solr query, not yet executed
        '''
        owned = solr_interface().query(owner=self.user.username)
        articles = owned.filter(content_model=Article.ARTICLE_CONTENT_MODEL)
        return articles.field_limit(ARTICLE_VIEW_FIELDS)

示例#6

0

显示文件

    def _find_articles(self):
        '''Build the base Solr query for articles owned by this user.

        The query matches on ``owner`` (``self.user.username``), is
        filtered by the article content model, and is limited to
        ``ARTICLE_VIEW_FIELDS``.

        Internal method with common functionality for
        :meth:`recent_articles` and :meth:`unpublished_articles`.

        :returns: the Solr query, not yet executed
        '''
        owned = solr_interface().query(owner=self.user.username)
        articles = owned.filter(content_model=Article.ARTICLE_CONTENT_MODEL)
        return articles.field_limit(ARTICLE_VIEW_FIELDS)

示例#7

0

显示文件

文件： views.py 项目： emory-libraries/OpenEmory

def view_department(request, id):
    '''Render the list of faculty (or non-faculty users with profiles)
    that belong to a single department.

    :param request: the current request
    :param id: department id
    :returns: rendered ``accounts/department.html`` with ``esdpeople``,
        ``department``, ``division`` and ``dep_id`` in the context
    :raises Http404: when Solr has no people for the department and no
        ``EsdPerson`` record carries that department id either
    '''
    # look people up in Solr by department code
    solr = solr_interface()
    in_department = solr.query(department_id=id) \
                        .filter(record_type=EsdPerson.record_type)
    people = in_department.sort_by('last_name').paginate(rows=150).execute()

    if not len(people):
        # it's possible no profile users were found (unlikely with real
        # data); in that case look up the department code in the
        # database to get division & department names
        matches = EsdPerson.objects.filter(department_id=id) \
                           .only('department_name', 'division_name') \
                           .distinct()
        # no department found for that id - 404
        if not matches:
            raise Http404
        record = matches[0]
        division = record.division_name
        dept = record.department_shortname
    else:
        first = people[0]
        division = first['division_name']
        names = first['department_name']
        # department_name is a list since an article can have 0..n. An
        # EsdPerson, though, only has one, so just grab the first.
        dept = names[0] if names else ''
        # shorten department name for display, since we have division
        # name for context: keep only what follows the last colon
        if ':' in dept:
            dept = dept.rsplit(':', 1)[1].strip()

    context = {
        'esdpeople': people,
        'department': dept,
        'division': division,
        'dep_id': id,
    }
    return render(request, 'accounts/department.html', context)

示例#8

0

显示文件

文件： context_processors.py 项目： mprefer/OpenEmory

def statistics(request):
    '''`Template context processor
    <https://docs.djangoproject.com/en/dev/ref/settings/#template-context-processors>`_
    that exposes account statistics as ``ACCOUNT_STATISTICS``.

    The value currently has a single key, ``total_users``: the number
    of Solr records whose ``record_type`` is ``EsdPerson.record_type``.

    :param request: the current request (not used)
    :returns: dict with the single key ``ACCOUNT_STATISTICS``
    '''
    people = solr_interface().query() \
                             .filter(record_type=EsdPerson.record_type)
    # rows=0 because only numFound is read from the response
    response = people.paginate(rows=0).execute()
    return {'ACCOUNT_STATISTICS': {'total_users': response.result.numFound}}

示例#9

0

显示文件

文件： context_processors.py 项目： emory-libraries/OpenEmory

def statistics(request):
    '''`Template context processor
    <https://docs.djangoproject.com/en/dev/ref/settings/#template-context-processors>`_
    that exposes account statistics as ``ACCOUNT_STATISTICS``.

    The value currently has a single key, ``total_users``: the number
    of Solr records whose ``record_type`` is ``EsdPerson.record_type``.

    :param request: the current request (not used)
    :returns: dict with the single key ``ACCOUNT_STATISTICS``
    '''
    people = solr_interface().query() \
                             .filter(record_type=EsdPerson.record_type)
    # rows=0 because only numFound is read from the response
    response = people.paginate(rows=0).execute()
    return {'ACCOUNT_STATISTICS': {'total_users': response.result.numFound}}

示例#10

0

显示文件

文件： views.py 项目： mprefer/OpenEmory

def view_department(request, id):
    '''Render the list of faculty (or non-faculty users with profiles)
    that belong to a single department.

    :param request: the current request
    :param id: department id
    :returns: rendered ``accounts/department.html`` with ``esdpeople``,
        ``department`` and ``division`` in the context
    :raises Http404: when Solr has no people for the department and no
        ``EsdPerson`` record carries that department id either
    '''
    # look people up in Solr by department code
    solr = solr_interface()
    in_department = solr.query(department_id=id) \
                        .filter(record_type=EsdPerson.record_type)
    people = in_department.sort_by('last_name').paginate(rows=150).execute()

    if not len(people):
        # it's possible no profile users were found (unlikely with real
        # data); in that case look up the department code in the
        # database to get division & department names
        matches = EsdPerson.objects.filter(department_id=id) \
                           .only('department_name', 'division_name') \
                           .distinct()
        # no department found for that id - 404
        if not matches:
            raise Http404
        record = matches[0]
        division = record.division_name
        dept = record.department_shortname
    else:
        first = people[0]
        division = first['division_name']
        names = first['department_name']
        # department_name is a list since an article can have 0..n. An
        # EsdPerson, though, only has one, so just grab the first.
        dept = names[0] if names else ''
        # shorten department name for display, since we have division
        # name for context: keep only what follows the last colon
        if ':' in dept:
            dept = dept.rsplit(':', 1)[1].strip()

    context = {
        'esdpeople': people,
        'department': dept,
        'division': division,
    }
    return render(request, 'accounts/department.html', context)

示例#11

0

显示文件

文件： views.py 项目： emory-libraries/OpenEmory

def faculty_autocomplete(request):
    '''Return JSON autocomplete suggestions for faculty names.

    The search string is read from the ``term`` GET parameter,
    lower-cased and split on whitespace; each resulting word is matched
    against the Solr ``ad_name`` field, either exactly or as a prefix.
    At most 10 records of type ``EsdPerson.record_type`` are returned,
    sorted by descending score and then by ``ad_name_sort``.

    :param request: the current request
    :returns: :class:`HttpResponse` with content type
        ``application/json`` whose body is a list of objects with the
        keys ``label``, ``description``, ``username``, ``first_name``,
        ``last_name`` and ``affiliation``
    '''
    term = request.GET.get('term', '')
    # handle multiple terms and strip off commas
    # e.g., if user searches for "lastname, firstname"
    terms = [t.strip(',') for t in term.lower().split() if t]
    # TODO: consider using eulcommon.searchutil here

    solr = solr_interface()
    # do an OR search for partial or exact matches in the full name
    term_filter = solr.Q()
    for t in terms:
        # exact match or partial match (exact word with * does not match)
        term_filter |= solr.Q(ad_name=t) | solr.Q(ad_name='%s*' % t)
    r = solr.query(term_filter).filter(record_type=EsdPerson.record_type) \
            .field_limit(['username', 'first_name',
                          'last_name', 'department_name',
                          'ad_name'], score=True) \
            .sort_by('-score').sort_by('ad_name_sort') \
            .paginate(rows=10).execute()

    # NOTE: may want to cut off based on some relevance score,
    # (e.g., if score is below 0.5 and there is at least one good match,
    # omit the less relevant items)

    # (a leftover debug print of the Solr response was removed here; a
    # request handler should not write to stdout)
    suggestions = [
        {'label': u['ad_name'],  # directory name in lastname, firstname format
         'description': u.get('department_name', ''),  # may be suppressed
         'username': u['username'],
         # first name is missing in some cases-- don't error if it's not present
         # NOTE: if first name is missing, name may be listed/filled in wrong
         'first_name': u.get('first_name', ''),
         'last_name': u['last_name'],
         'affiliation': 'Emory University'}
        for u in r
    ]
    return HttpResponse(json_serializer.encode(suggestions),
                        content_type='application/json')

示例#12

0

显示文件

文件： index_faculty.py 项目： emory-libraries/OpenEmory

    def handle(self, verbosity=1, *args, **options):
        '''Connect to Solr and run :meth:`update_faculty_index`.

        :param verbosity: output level; stored on ``self.verbosity`` as
            an int, and the faculty count is announced when it is at
            least ``self.v_normal``
        :param options: command options; ``index_url``, when present, is
            passed to ``solr_interface``
        :raises CommandError: if the Solr connection fails with a socket
            error, or if updating the index raises ``SolrError``
        '''
        self.verbosity = int(verbosity)

        if self.verbosity >= self.v_normal:
            faculty_total = EsdPerson.faculty.all().count()
            # single parenthesised argument: prints the same text
            # whether print is a statement or a function
            print('Indexing ESD data for %d faculty members in Solr' %
                  (faculty_total,))

        solr_url = options.get('index_url', None)
        try:
            self.solr = solr_interface(solr_url)
        except socket.error as se:
            raise CommandError('Failed to connect to Solr (%s)' % se)

        try:
            self.update_faculty_index()
        except SolrError as se:
            if 'unknown field' not in str(se):
                raise CommandError('Solr error (%s)' % se)
            # an unknown-field error gets its own message with a hint
            raise CommandError('Solr unknown field error '
                               '(check that local schema matches running instance)')

示例#13

0

显示文件

文件： views.py 项目： mprefer/OpenEmory

def departments(request):
    '''List department names based on Faculty information in ESD,
    grouped by division name.

    Departments are taken from the Solr facet on ``division_dept_id``
    over records of type ``EsdPerson.record_type``.

    :param request: the current request
    :returns: rendered ``accounts/departments.html`` with the list of
        department dicts under ``departments``
    '''
    solr = solr_interface()
    facet_field = 'division_dept_id'
    faculty = solr.query(record_type=EsdPerson.record_type)
    faceted = faculty.facet_by(facet_field, limit=-1, sort='index')
    # rows=0 because only the facet counts are read from the response
    response = faceted.paginate(rows=0).execute()

    # division_dept_id field is indexed in Solr as
    # division_name|division_code|department_shortname|department_id
    # split each value out into a dict and attach its facet count
    depts = []
    for value, count in response.facet_counts.facet_fields[facet_field]:
        info = EsdPerson.split_department(value)
        info['total'] = count
        depts.append(info)

    context = {'departments': depts}
    return render(request, 'accounts/departments.html', context)

示例#14

0

显示文件

文件： index_faculty.py 项目： mprefer/OpenEmory

    def handle(self, verbosity=1, *args, **options):
        '''Connect to Solr and run :meth:`update_faculty_index`.

        :param verbosity: output level; stored on ``self.verbosity`` as
            an int, and the faculty count is announced when it is at
            least ``self.v_normal``
        :param options: command options; ``index_url``, when present, is
            passed to ``solr_interface``
        :raises CommandError: if the Solr connection fails with a socket
            error, or if updating the index raises ``SolrError``
        '''
        self.verbosity = int(verbosity)

        if self.verbosity >= self.v_normal:
            faculty_total = EsdPerson.faculty.all().count()
            # single parenthesised argument: prints the same text
            # whether print is a statement or a function
            print('Indexing ESD data for %d faculty members in Solr' %
                  (faculty_total,))

        solr_url = options.get('index_url', None)
        try:
            self.solr = solr_interface(solr_url)
        except socket.error as se:
            raise CommandError('Failed to connect to Solr (%s)' % se)

        try:
            self.update_faculty_index()
        except SolrError as se:
            if 'unknown field' not in str(se):
                raise CommandError('Solr error (%s)' % se)
            # an unknown-field error gets its own message with a hint
            raise CommandError('Solr unknown field error '
                               '(check that local schema matches running instance)')

示例#15

0

显示文件

文件： views.py 项目： emory-libraries/OpenEmory

def departments(request):
    '''List department names based on Faculty information in ESD,
    grouped by division name.

    Departments are taken from the Solr facet on ``division_dept_id``
    over records of type ``EsdPerson.record_type``.

    :param request: the current request
    :returns: rendered ``accounts/departments.html`` with the list of
        department dicts under ``departments``
    '''
    solr = solr_interface()
    facet_field = 'division_dept_id'
    faculty = solr.query(record_type=EsdPerson.record_type)
    faceted = faculty.facet_by(facet_field, limit=-1, sort='index')
    # rows=0 because only the facet counts are read from the response
    response = faceted.paginate(rows=0).execute()

    # division_dept_id field is indexed in Solr as
    # division_name|division_code|department_shortname|department_id
    # split each value out into a dict and attach its facet count
    depts = []
    for value, count in response.facet_counts.facet_fields[facet_field]:
        info = EsdPerson.split_department(value)
        info['total'] = count
        depts.append(info)

    context = {'departments': depts}
    return render(request, 'accounts/departments.html', context)

示例#16

0

显示文件

文件： views.py 项目： mprefer/OpenEmory

def view_department(request, id):
    '''Render the list of faculty (or non-faculty users with profiles)
    that belong to a single department.

    :param request: the current request
    :param id: department id
    :returns: rendered ``accounts/department.html`` with ``esdpeople``,
        ``department`` and ``division`` in the context
    :raises Http404: when Solr has no people for the department and no
        ``EsdPerson`` record carries that department id either
    '''
    # look people up in Solr by department code
    solr = solr_interface()
    in_department = solr.query(department_id=id) \
                        .filter(record_type=EsdPerson.record_type)
    people = in_department.sort_by('last_name').paginate(rows=150).execute()

    if not len(people):
        # it's possible no profile users were found (unlikely with real
        # data); in that case look up the department code in the
        # database to get division & department names
        matches = EsdPerson.objects.filter(department_id=id) \
                           .only('department_name', 'division_name') \
                           .distinct()
        # no department found for that id - 404
        if not matches:
            raise Http404
        record = matches[0]
        division = record.division_name
        dept = record.department_shortname
    else:
        first = people[0]
        division = first['division_name']
        names = first['department_name']
        # department_name is a list since an article can have 0..n. An
        # EsdPerson, though, only has one, so just grab the first.
        dept = names[0] if names else ''
        # shorten department name for display, since we have division
        # name for context: keep only what follows the last colon
        if ':' in dept:
            dept = dept.rsplit(':', 1)[1].strip()

    context = {
        'esdpeople': people,
        'department': dept,
        'division': division,
    }
    return render(request, 'accounts/department.html', context)

示例#17

0

显示文件

文件： views.py 项目： mprefer/OpenEmory

def faculty_autocomplete(request):
    '''Return JSON autocomplete suggestions for faculty names.

    The search string is read from the ``term`` GET parameter,
    lower-cased and split on whitespace; each resulting word is matched
    against the Solr ``ad_name`` field, either exactly or as a prefix.
    At most 10 records of type ``EsdPerson.record_type`` are returned,
    sorted by descending score and then by ``ad_name_sort``.

    :param request: the current request
    :returns: :class:`HttpResponse` with content type
        ``application/json`` whose body is a list of objects with the
        keys ``label``, ``description``, ``username``, ``first_name``,
        ``last_name`` and ``affiliation``
    '''
    term = request.GET.get('term', '')
    # handle multiple terms and strip off commas
    # e.g., if user searches for "lastname, firstname"
    terms = [t.strip(',') for t in term.lower().split() if t]
    # TODO: consider using eulcommon.searchutil here

    solr = solr_interface()
    # do an OR search for partial or exact matches in the full name
    term_filter = solr.Q()
    for t in terms:
        # exact match or partial match (exact word with * does not match)
        term_filter |= solr.Q(ad_name=t) | solr.Q(ad_name='%s*' % t)
    r = solr.query(term_filter).filter(record_type=EsdPerson.record_type) \
            .field_limit(['username', 'first_name',
                          'last_name', 'department_name',
                          'ad_name'], score=True) \
            .sort_by('-score').sort_by('ad_name_sort') \
            .paginate(rows=10).execute()

    # NOTE: may want to cut off based on some relevance score,
    # (e.g., if score is below 0.5 and there is at least one good match,
    # omit the less relevant items)
    suggestions = [
        {
            # directory name in lastname, firstname format
            'label': u['ad_name'],
            'description': u.get('department_name', ''),  # may be suppressed
            'username': u['username'],
            # first name is missing in some cases-- don't error if it's not present
            # NOTE: if first name is missing, name may be listed/filled in wrong
            'first_name': u.get('first_name', ''),
            'last_name': u['last_name'],
            'affiliation': 'Emory University'
        } for u in r
    ]
    # content_type rather than the deprecated mimetype keyword, which
    # newer Django releases no longer accept
    return HttpResponse(json_serializer.encode(suggestions),
                        content_type='application/json')

示例#18

0

显示文件

文件： expire_embargo.py 项目： mprefer/OpenEmory

    def handle(self, *args, **options):
        '''Index the full text of articles whose embargo has ended.

        Pids may be given as positional arguments; otherwise Solr is
        searched for active Articles with an ``embargo_end`` earlier than
        today that have no ``fulltext`` indexed.  Each object is loaded
        from the repository and, when its index data contains non-blank
        ``fulltext``, that data is added to Solr (skipped when the
        ``noact`` option is set).  A summary of the counters is written
        to stdout at the end.

        :param args: optional pids to process instead of searching Solr
        :param options: command options; ``verbosity`` and ``noact`` are
            read here
        :raises CommandError: if building the Solr search fails or the
            pid list cannot be paginated
        '''
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters reported in the summary at the end
        counts = defaultdict(int)

        # connection to repository
        # uses default user / pass configured in localsettings.py
        repo = Repository()

        # connection to solr
        solr = solr_interface()

        # today's date in the same format as embargo_end date
        today = datetime.datetime.now().strftime("%Y-%m-%d")

        if args:
            # pids specified on the command line: wrap each one in a dict
            # so this list and the solr results are accessed the same way
            pid_set = [{'pid': pid} for pid in args]
        else:
            # search for active Articles with an embargo_end date less than
            # today, and that do not have fulltext field indexed. Only
            # return the pid for each record.
            try:
                pid_set = solr.query() \
                              .filter(content_model=Article.ARTICLE_CONTENT_MODEL,
                                      state='A', embargo_end__lt=today) \
                              .exclude(fulltext__any=True) \
                              .field_limit('pid')
            except Exception as e:
                if 'is not a valid field name' in e.message:
                    raise CommandError('Solr unknown field error ' +
                                       '(check that local schema matches running instance)')
                raise CommandError('Error (%s)' % e.message)

        try:
            expired_embargoes = Paginator(pid_set, 20)
            counts['total'] = expired_embargoes.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))
            # nothing below can work without a usable paginator; stop
            # here instead of falling through to the page loop
            raise CommandError('Error paginating items (%s)' % e.message)

        # process all expired embargoes
        for p in expired_embargoes.page_range:
            try:
                objs = expired_embargoes.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                counts['errors'] += 1
                continue
            for obj in objs:
                try:
                    article = repo.get_object(type=Article, pid=obj['pid'])
                    if not article.exists:
                        self.output(1, "Skipping %s because pid does not exist" % obj['pid'])
                        counts['skipped'] += 1
                        continue
                    # do not try to index items without valid fulltext field
                    data = article.index_data()
                    if 'fulltext' in data and data['fulltext'] is not None \
                            and data['fulltext'].strip():
                        self.output(1, "Processing %s" % article.pid)
                        # noact: report the item but leave Solr untouched
                        if not options['noact']:
                            solr.add(data)
                            counts['indexed'] += 1
                    else:
                        self.output(1, "Skipping %s because fulltext does not exist" % article.pid)
                        counts['skipped'] += 1
                except Exception as e:
                    # one bad object should not stop the rest of the run
                    self.output(0, "Error processing pid: %s : %s " % (obj['pid'], e.message))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Indexed: %s\n" % counts['indexed'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

示例#19

0

显示文件

    def handle(self, *args, **options):
        '''Gather per-author article data for the previous quarter and
        pass it to :meth:`send_mail`.

        The previous quarter is derived from today's date.  Netids may
        be given as positional arguments; otherwise they are taken from
        the Solr ``owner`` facet over active Articles.  For each netid
        the user's articles are queried in Solr and handed to
        :meth:`get_article_data`; when a matching ``User`` exists, the
        name, email and quarter start/end dates are added and
        :meth:`send_mail` is called with the result and ``options``.

        :param args: optional netids to process instead of searching Solr
        :param options: command options; ``verbosity`` is read here and
            the whole dict is passed on to :meth:`send_mail`
        :raises CommandError: if the Solr search for authors fails
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to solr
        solr = solr_interface()

        # today's date is used to calculate the previous quarter
        today = datetime.datetime.today()
        current_year = today.year
        current_month = today.month
        current_quarter = year_quarter(current_month)
        self.output(
            1, "Current month/year: %s/%s quarter: %s" %
            (current_month, current_year, current_quarter))

        if current_quarter == 1:
            # the quarter before Q1 is Q4 of the previous year
            self.year = current_year - 1
            self.quarter = 4
        else:
            self.year = current_year
            self.quarter = current_quarter - 1

        self.output(
            1, "Report will run for year: %s quarter: %s" %
            (self.year, self.quarter))

        # start and end dates for each quarter of the report year
        start_end = {
            1: ('January 1, %s' % self.year, 'March 31, %s' % self.year),
            2: ('April 1, %s' % self.year, 'June 30, %s' % self.year),
            3: ('July 1, %s' % self.year, 'September 30, %s' % self.year),
            4: ('October 1, %s' % self.year, 'December 31, %s' % self.year),
        }

        if args:
            # netids specified on the command line: use that list
            netid_set = list(args)
        else:
            # search for authors with published articles
            try:
                netid_set = solr.query() \
                    .filter(content_model=Article.ARTICLE_CONTENT_MODEL,
                            state='A') \
                    .facet_by('owner').paginate(rows=0).execute()
            except Exception as e:
                if 'is not a valid field name' in e.message:
                    raise CommandError(
                        'Solr unknown field error ' +
                        '(check that local schema matches running instance)')
                raise CommandError('Error (%s)' % e.message)

            # get just the netid and throw away the count
            netid_set = netid_set.facet_counts.facet_fields['owner']
            netid_set = [n[0] for n in netid_set]

        # query solr for all articles for each user
        for n in netid_set:
            self.output(1, "Processing user %s" % n)
            try:
                article_query = solr.query().filter(
                    content_model=Article.ARTICLE_CONTENT_MODEL,
                    state='A',
                    owner=n).field_limit(['pid', 'title'])
                articles = Paginator(article_query, 5)  # change later
                articles = articles.object_list
            except Exception as e:
                # self.output is called as a plain function everywhere
                # else in this method; report through it and move on to
                # the next user
                self.output(0, e.message)
                continue

            article_data = self.get_article_data(articles, self.year,
                                                 self.quarter)

            # add name and email to article data
            user = User.objects.filter(username=n)
            if user:
                user = user[0]  # again should only be 1 record
                article_data['first_name'] = user.first_name
                article_data['last_name'] = user.last_name
                article_data['email'] = user.email
                article_data['start'] = start_end[self.quarter][0]
                article_data['end'] = start_end[self.quarter][1]

                # send the email!
                self.send_mail(article_data, options)

示例#20

0

显示文件

文件： sitemaps.py 项目： emory-libraries/OpenEmory

 def items(self):
     '''Return the Solr query for state ``'A'`` records with the
     article content model.'''
     return solr_interface().query(
         content_model=Publication.ARTICLE_CONTENT_MODEL, state='A')

示例#21

0

显示文件

 def items(self):
     '''Return the Solr query for state ``'A'`` records with the
     article content model.'''
     return solr_interface().query(
         content_model=Article.ARTICLE_CONTENT_MODEL, state='A')

示例#22

0

显示文件

文件： cleanup_articles.py 项目： mprefer/OpenEmory

    def handle(self, *args, **options):
        self.verbosity = int(options["verbosity"])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # counters
        counts = defaultdict(int)

        # check required options
        if not options["username"]:
            raise CommandError("Username is required")
        else:
            if not options["password"] or options["password"] == "":
                options["password"] = getpass()

        # connection to repository
        repo = Repository(username=options["username"], password=options["password"])

        # Connection to solr
        solr = solr_interface()

        coll = repo.get_object(pid=settings.PID_ALIASES["oe-collection"])

        # if pids specified, use that list
        if len(args) != 0:
            pid_set = list(args)
            # convert list into dict so both solr and pid list formats are the same
            pid_set = [{"pid": pid} for pid in pid_set]

        else:
            # search for Articles. Only return the pid for each record.
            try:
                pid_set = solr.query().filter(content_model=Article.ARTICLE_CONTENT_MODEL).field_limit("pid")

            except Exception as e:
                if "is not a valid field name" in e.message:
                    raise CommandError(
                        "Solr unknown field error " + "(check that local schema matches running instance)"
                    )
                raise CommandError("Error (%s)" % e.message)

        try:
            articles = Paginator(pid_set, 20)
            counts["total"] = articles.count
        except Exception as e:
            self.output(0, "Error paginating items: : %s " % (e.message))

        # process all Articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page: %s : %s " % (p, e.message))
                counts["errors"] += 1
                continue
            for obj in objs:
                try:
                    article = repo.get_object(type=Article, pid=obj["pid"])
                    if not article.exists:
                        self.output(1, "Skipping %s because pid does not exist" % obj["pid"])
                        counts["skipped"] += 1
                        continue
                    else:
                        self.output(0, "Processing %s" % article.pid)

                        # clear out all access_conditions to prep for licens and copyright fields
                        article.descMetadata.content.access_conditions = []

                        # Remove contentMetadata if empty
                        if article.contentMetadata.exists and article.contentMetadata.content.is_empty():
                            if not options["noact"]:
                                article.api.purgeDatastream(
                                    article.pid, "contentMetadata", logMessage="Removing empty datastream"
                                )
                            self.output(1, "Removing empty contentMetadata datastream %s" % article.pid)
                            counts["removed"] += 1

                        elif article.contentMetadata.exists:
                            # Copy License info if available
                            if article.contentMetadata.content.license:
                                article.descMetadata.content.create_license()
                                article.descMetadata.content.license.text = article.contentMetadata.content.license.text
                                article.descMetadata.content.license.link = article.contentMetadata.content.license.link
                                self.output(1, "Copying license info to MODS %s" % article.pid)
                                counts["license"] += 1

                            # Copy License info from copyright secton if available and not in License section
                            elif (
                                article.contentMetadata.content.copyright
                                and "creative commons" in article.contentMetadata.content.copyright.lower()
                            ):
                                article.descMetadata.content.create_license()
                                article.descMetadata.content.license.text = article.contentMetadata.content.copyright
                                self.output(
                                    1, "Copying license info from Copyright section to MODS for %s" % article.pid
                                )
                                counts["copyright_license"] += 1

                            # Copy Copyright info if available
                            if article.contentMetadata.content.copyright:
                                article.descMetadata.content.create_copyright()
                                article.descMetadata.content.copyright.text = article.contentMetadata.content.copyright
                                self.output(1, "Copying copyright info to MODS %s" % article.pid)
                                counts["copyright"] += 1

                        # Add to collection
                        article.collection = coll
                        self.output(1, "Adding %s to collection %s" % (article.pid, coll.pid))
                        counts["collection"] += 1

                        # Add itemID for OAI
                        #                        if article.is_published:
                        #                            article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                        #                            self.output(1, "Adding itemID to %s" % article.pid)
                        #                            counts['itemid']+= 1

                        # save article
                        if not options["noact"]:
                            article.save()
                except Exception as e:
                    self.output(0, "Error processing pid: %s : %s " % (obj["pid"], e.message))
                    counts["errors"] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts["total"])
        self.stdout.write("Removed contentMetadata: %s\n" % counts["removed"])
        self.stdout.write("Updated License from License section: %s\n" % counts["license"])
        self.stdout.write("Updated License from Copyright section: %s\n" % counts["copyright_license"])
        self.stdout.write("Updated Copyright: %s\n" % counts["copyright"])
        self.stdout.write("Added to collection: %s\n" % counts["collection"])
        #        self.stdout.write("Added itemID: %s\n" % counts['itemid'])
        self.stdout.write("Skipped: %s\n" % counts["skipped"])
        self.stdout.write("Errors: %s\n" % counts["errors"])

示例#23

0

显示文件

文件： cleanup_articles.py 项目： mprefer/OpenEmory

    def handle(self, *args, **options):
        '''Clean up Article objects.

        For every selected article this resets the descMetadata access
        conditions, purges the contentMetadata datastream when it is
        empty (otherwise copies its license / copyright information into
        descMetadata), and adds the article to the collection configured
        under the ``oe-collection`` pid alias.

        Pids to process may be given as positional arguments; when none
        are given, Solr is queried for every object with the Article
        content model.  When the ``noact`` option is set nothing is
        purged or saved, but messages and counters are still produced.

        :raises CommandError: if no username is supplied, if the Solr
            query cannot be built, or if the result set cannot be
            paginated
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        def error_text(exc):
            # ``message`` is a Python 2-only exception attribute and may be
            # empty; fall back to the exception itself so the error
            # handlers below cannot fail with AttributeError
            return '%s' % (getattr(exc, 'message', None) or exc,)

        # counters
        counts = defaultdict(int)

        # check required options; prompt for the password if not supplied
        if not options['username']:
            raise CommandError('Username is required')
        if not options['password']:
            options['password'] = getpass()

        # connection to repository
        repo = Repository(username=options['username'],
                          password=options['password'])

        # connection to solr
        solr = solr_interface()

        coll = repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

        if args:
            # pids specified: wrap each one in a dict so explicit pids and
            # solr results can be handled the same way
            pid_set = [{'pid': pid} for pid in args]
        else:
            # search for Articles. Only return the pid for each record.
            try:
                pid_set = solr.query().filter(
                    content_model=Article.ARTICLE_CONTENT_MODEL).field_limit(
                        'pid')
            except Exception as e:
                message = error_text(e)
                if 'is not a valid field name' in message:
                    raise CommandError(
                        'Solr unknown field error ' +
                        '(check that local schema matches running instance)')
                raise CommandError('Error (%s)' % message)

        try:
            articles = Paginator(pid_set, 20)
            counts['total'] = articles.count
        except Exception as e:
            # nothing can be processed without a usable paginator, so stop
            # here instead of falling through to the loop below
            raise CommandError('Error paginating items: %s' % error_text(e))

        # process all Articles, one page at a time
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(
                    0, "Error getting page: %s : %s " % (p, error_text(e)))
                counts['errors'] += 1
                continue
            for obj in objs:
                try:
                    article = repo.get_object(type=Article, pid=obj['pid'])
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            obj['pid'])
                        counts['skipped'] += 1
                        continue

                    self.output(0, "Processing %s" % article.pid)

                    # clear out all access_conditions to prep for license
                    # and copyright fields
                    article.descMetadata.content.access_conditions = []

                    # Remove contentMetadata if empty
                    if article.contentMetadata.exists and \
                            article.contentMetadata.content.is_empty():
                        if not options['noact']:
                            article.api.purgeDatastream(
                                article.pid,
                                'contentMetadata',
                                logMessage='Removing empty datastream')
                        self.output(
                            1,
                            "Removing empty contentMetadata datastream %s"
                            % article.pid)
                        counts['removed'] += 1

                    elif article.contentMetadata.exists:
                        # Copy License info if available
                        if article.contentMetadata.content.license:
                            article.descMetadata.content.create_license()
                            article.descMetadata.content.license.text = article.contentMetadata.content.license.text
                            article.descMetadata.content.license.link = article.contentMetadata.content.license.link
                            self.output(
                                1, "Copying license info to MODS %s" %
                                article.pid)
                            counts['license'] += 1

                        # Copy License info from copyright section if
                        # available and not in License section
                        elif article.contentMetadata.content.copyright and \
                                'creative commons' in article.contentMetadata.content.copyright.lower():
                            article.descMetadata.content.create_license()
                            article.descMetadata.content.license.text = article.contentMetadata.content.copyright
                            self.output(
                                1,
                                "Copying license info from Copyright section to MODS for %s"
                                % article.pid)
                            counts['copyright_license'] += 1

                        # Copy Copyright info if available
                        if article.contentMetadata.content.copyright:
                            article.descMetadata.content.create_copyright()
                            article.descMetadata.content.copyright.text = article.contentMetadata.content.copyright
                            self.output(
                                1, "Copying copyright info to MODS %s" %
                                article.pid)
                            counts['copyright'] += 1

                    # Add to collection
                    article.collection = coll
                    self.output(
                        1, "Adding %s to collection %s" %
                        (article.pid, coll.pid))
                    counts['collection'] += 1

                    # Add itemID for OAI
                    #                    if article.is_published:
                    #                        article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                    #                        self.output(1, "Adding itemID to %s" % article.pid)
                    #                        counts['itemid']+= 1

                    # save article
                    if not options['noact']:
                        article.save()
                except Exception as e:
                    # one bad object should not stop the run: report it,
                    # count it, and move on to the next pid
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (obj['pid'], error_text(e)))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Removed contentMetadata: %s\n" % counts['removed'])
        self.stdout.write("Updated License from License section: %s\n" %
                          counts['license'])
        self.stdout.write("Updated License from Copyright section: %s\n" %
                          counts['copyright_license'])
        self.stdout.write("Updated Copyright: %s\n" % counts['copyright'])
        self.stdout.write("Added to collection: %s\n" % counts['collection'])
        #        self.stdout.write("Added itemID: %s\n" % counts['itemid'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

示例#24

0

显示文件

    def handle(self, *args, **options):
        '''Index article fulltext in Solr once an embargo has ended.

        Pids to process may be given as positional arguments; when none
        are given, Solr is queried for active objects with the Article
        content model whose ``embargo_end`` is before today and that have
        no ``fulltext`` value indexed.  Each selected article's index
        data is added to Solr, provided it contains non-blank fulltext.
        When the ``noact`` option is set nothing is sent to Solr and the
        ``indexed`` counter is not incremented.

        :raises CommandError: if the Solr query cannot be built or the
            result set cannot be paginated
        '''
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        def error_text(exc):
            # ``message`` is a Python 2-only exception attribute and may be
            # empty; fall back to the exception itself so the error
            # handlers below cannot fail with AttributeError
            return '%s' % (getattr(exc, 'message', None) or exc,)

        # counters
        counts = defaultdict(int)

        # connection to repository
        # uses default user / pass configured in localsettings.py
        repo = Repository()

        # connection to solr
        solr = solr_interface()

        # today's date in the same format as embargo_end date
        today = datetime.datetime.now().strftime("%Y-%m-%d")

        if args:
            # pids specified: wrap each one in a dict so explicit pids and
            # solr results can be handled the same way
            pid_set = [{'pid': pid} for pid in args]
        else:
            # search for active Articles with an embargo_end date less than
            # today, and that do not have fulltext field indexed.
            # Only return the pid for each record.
            try:
                pid_set = solr.query() \
                    .filter(content_model=Article.ARTICLE_CONTENT_MODEL,
                            state='A', embargo_end__lt=today) \
                    .exclude(fulltext__any=True) \
                    .field_limit('pid')
            except Exception as e:
                message = error_text(e)
                if 'is not a valid field name' in message:
                    raise CommandError(
                        'Solr unknown field error ' +
                        '(check that local schema matches running instance)')
                raise CommandError('Error (%s)' % message)

        try:
            expired_embargoes = Paginator(pid_set, 20)
            counts['total'] = expired_embargoes.count
        except Exception as e:
            # nothing can be processed without a usable paginator, so stop
            # here instead of falling through to the loop below
            raise CommandError('Error paginating items: %s' % error_text(e))

        # process all expired embargoes, one page at a time
        for p in expired_embargoes.page_range:
            try:
                objs = expired_embargoes.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(
                    0, "Error getting page: %s : %s " % (p, error_text(e)))
                counts['errors'] += 1
                continue
            for obj in objs:
                try:
                    article = repo.get_object(type=Article, pid=obj['pid'])
                    if not article.exists:
                        self.output(
                            1, "Skipping %s because pid does not exist" %
                            obj['pid'])
                        counts['skipped'] += 1
                        continue
                    # do not try to index items without valid fulltext field
                    data = article.index_data()
                    if 'fulltext' in data and data['fulltext'] is not None \
                            and data['fulltext'].strip():
                        self.output(1, "Processing %s" % article.pid)
                        if not options['noact']:
                            solr.add(data)
                            counts['indexed'] += 1
                    else:
                        self.output(
                            1, "Skipping %s because fulltext does not exist" %
                            article.pid)
                        counts['skipped'] += 1
                except Exception as e:
                    # one bad object should not stop the run: report it,
                    # count it, and move on to the next pid
                    self.output(
                        0, "Error processing pid: %s : %s " %
                        (obj['pid'], error_text(e)))
                    counts['errors'] += 1

        # summarize what was done
        self.stdout.write("Total number selected: %s\n" % counts['total'])
        self.stdout.write("Indexed: %s\n" % counts['indexed'])
        self.stdout.write("Skipped: %s\n" % counts['skipped'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

示例#25

0

显示文件

文件： quarterly_stats_by_author.py 项目： emory-libraries/OpenEmory

    def handle(self, *args, **options):
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        #counters for script reporting
        counts = defaultdict(int)

        #Connection to solr
        solr = solr_interface()

        #info today's date will be used to caculate previous quarter
        today = datetime.datetime.today()
        current_year = today.year
        current_month = today.month
        current_quarter = year_quarter(current_month)
        self.output(1, "Current month/year: %s/%s quarter: %s" % (current_month, current_year, current_quarter))

        if current_quarter == 1:
            self.year = current_year -1
            self.quarter = 4
        else:
            self.year = current_year
            self.quarter = current_quarter - 1

        self.output(1, "Report will run for year: %s quarter: %s" % (self.year, self.quarter))

        #start and end dates
        start_end = {
            1 : ('January 1, %s' % self.year, 'March 31, %s' % self.year),
            2 : ('April 1, %s' % self.year, 'June 30, %s' % self.year),
            3 : ('July 1, %s' % self.year, 'September 30, %s' % self.year),
            4 : ('October 1, %s' % self.year, 'December 31, %s' % self.year),
        }




        #if netids specified, use that list
        if len(args) != 0:
            netid_set = list(args)

        else:
            #search for authors with published articles.
            try:
                netid_set = solr.query().filter(content_model=Publication.ARTICLE_CONTENT_MODEL,state='A').\
                facet_by('owner').paginate(rows=0).execute()

            except Exception as e:
                if 'is not a valid field name' in e.message:
                    raise CommandError('Solr unknown field error ' +
                                       '(check that local schema matches running instance)')
                raise CommandError('Error (%s)' % e.message)

            #get just the netid and thow away the count
            netid_set = netid_set.facet_counts.facet_fields['owner']
            netid_set = [n[0] for n in netid_set]

        #query solr for all articles for each user
        for n in netid_set:
            self.output(1, "Processing user %s" % n)
            try:
                article_query = solr.query().filter(content_model=Publication.ARTICLE_CONTENT_MODEL,state='A' ,
                                                 owner=n).field_limit(['pid', 'title'])
                articles = Paginator(article_query, 5) #change later
                articles = articles.object_list
            except Exception as e:
                self.output.error(0, e.message)
                continue

            article_data = self.get_article_data(articles, self.year, self.quarter)

            #add name and email to article data
            user = User.objects.filter(username=n)
            if user:
                user = user[0] # again should only be 1 record
                article_data['first_name'] = user.first_name
                article_data['last_name'] = user.last_name
                article_data['email'] = user.email
                article_data['start'] = start_end[self.quarter][0]
                article_data['end'] = start_end[self.quarter][1]

                #send the email!
                self.send_mail(article_data, options)