Example #1
File: tasks.py Project: wejhink/MyJobs
def task_reindex_solr(solr_location=None):
    """
    Adds all ProfileUnits, Users, and SavedSearches to solr.

    Inputs:
    :solr_location: Dict of separate cores to be updated (optional);
        defaults to the instance defined in settings
    """
    if solr_location is None:
        solr_location = settings.SOLR
    documents = []

    # Profile units are keyed by user id.
    user_ids = User.objects.all().values_list('id', flat=True)
    for user_id in user_ids:
        documents.append(profileunits_to_dict(user_id))

    # Only index saved searches that still have a recipient.
    saved_searches = SavedSearch.objects.filter(user__isnull=False)
    for search in saved_searches:
        saved_search_dict = object_to_dict(SavedSearch, search)
        saved_search_dict['doc_type'] = 'savedsearch'
        documents.append(saved_search_dict)

    for user in User.objects.all():
        documents.append(object_to_dict(User, user))

    # Chunk the list so each solr.add() stays a manageable size.
    documents = split_list(documents, 1000)

    for location in solr_location.values():
        solr = pysolr.Solr(location)
        for chunk in documents:
            # split_list pads its chunks; strip the padding first.
            chunk = filter(None, list(chunk))
            solr.add(chunk)
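The split_list helper is not shown in these examples; judging from the filter(None, ...) calls that follow every use, it chunks a list into fixed-size groups and pads the final group with None, much like the itertools "grouper" recipe. A minimal sketch under that assumption (the project's actual helper may differ):

from itertools import izip_longest  # itertools.zip_longest on Python 3

def split_list(iterable, n, fillvalue=None):
    """Collect items into fixed-length chunks of size n, padding the
    final chunk with fillvalue; callers strip the padding with
    filter(None, ...) before handing a chunk to solr."""
    args = [iter(iterable)] * n
    # Materialize the chunks so they can be iterated once per core.
    return [list(chunk) for chunk in izip_longest(fillvalue=fillvalue, *args)]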
Example #2
File: tasks.py Project: wejhink/MyJobs
def update_solr_task(solr_location=None):
    """
    Deletes all items scheduled for deletion, and then adds all items
    scheduled to be added to solr.

    Inputs:
    :solr_location: Dict of separate cores to be updated (optional);
        defaults to the instance defined in settings
    """
    # mail.outbox only exists while the test framework is running; in
    # that case, always target the test core.
    if hasattr(mail, 'outbox'):
        solr_location = settings.TEST_SOLR_INSTANCE
    elif solr_location is None:
        solr_location = settings.SOLR
    objs = Update.objects.filter(delete=True).values_list('uid', flat=True)

    if objs:
        objs = split_list(objs, 1000)
        for obj_list in objs:
            obj_list = filter(None, list(obj_list))
            uid_list = " OR ".join(obj_list)
            for location in solr_location.values():
                solr = pysolr.Solr(location)
                solr.delete(q="uid:(%s)" % uid_list)
        Update.objects.filter(delete=True).delete()

    objs = Update.objects.filter(delete=False)
    updates = []

    for obj in objs:
        content_type, key = obj.uid.split("##")
        model = ContentType.objects.get_for_id(content_type).model_class()
        if model == SavedSearch:
            search = model.objects.get(pk=key)
            # Saved search recipients can currently be null; displaying these
            # searches may be implemented in the future but will likely
            # require some template work.
            if search.user:
                updates.append(object_to_dict(model, search))
        # If the user is being updated, because the user is stored on the
        # SavedSearch document, every SavedSearch belonging to that user
        # also has to be updated.
        elif model == User:
            searches = SavedSearch.objects.filter(user_id=key)
            updates.extend(object_to_dict(SavedSearch, s) for s in searches)
            updates.append(object_to_dict(model, model.objects.get(pk=key)))
        else:
            updates.append(profileunits_to_dict(key))

    updates = split_list(updates, 1000)
    for location in solr_location.values():
        solr = pysolr.Solr(location)
        for update_subset in updates:
            update_subset = filter(None, list(update_subset))
            solr.add(update_subset)
    objs.delete()
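The task drains Update rows that are queued elsewhere in the project. One plausible way such rows could be created, sketched here purely as an assumption (the _uid helper and signal handlers below are hypothetical, not project code), is a post_save/post_delete signal pair that writes the content_type_id##pk uid the task splits on:

from django.contrib.contenttypes.models import ContentType
from django.db.models.signals import post_save, post_delete

def _uid(instance):
    # 'content_type_id##pk', the format update_solr_task() splits on
    content_type = ContentType.objects.get_for_model(instance)
    return '%s##%s' % (content_type.pk, instance.pk)

def queue_solr_add(sender, instance, **kwargs):
    Update.objects.get_or_create(uid=_uid(instance), delete=False)

def queue_solr_delete(sender, instance, **kwargs):
    Update.objects.get_or_create(uid=_uid(instance), delete=True)

post_save.connect(queue_solr_add, sender=SavedSearch)
post_delete.connect(queue_solr_delete, sender=SavedSearch)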
Example #3
    def test_user_to_dict(self):
        """
        Confirms that a solr dictionary is being generated as expected by
        the object_to_dict function for Users.

        """
        user = UserFactory(email="*****@*****.**")
        content_type = ContentType.objects.get_for_model(User)
        expected = {
            'User_is_superuser': False,
            u'User_id': 1,
            'uid': '%s##%s' % (str(content_type.pk), str(user.pk)),
            'User_is_active': True,
            'User_user_guid': 'c1cf679c-86f8-4bce-bf1a-ade8341cd3c1',
            'User_is_staff': False,
            'User_first_name': u'',
            'User_gravatar': '*****@*****.**',
            'User_last_name': u'',
            'User_is_disabled': False,
            'User_opt_in_myjobs': True,
            'User_profile_completion': 0,
            'User_opt_in_employers': True,
            'User_email': '*****@*****.**',
        }

        result = object_to_dict(User, user)

        # Exact dictionary comparisons can't be made because of the datetime
        # fields, so compare a few fields instead.
        self.assertEqual(expected['uid'], result['uid'])
        self.assertEqual(expected['User_email'], result['User_email'])
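The expected dictionary implies the shape of object_to_dict's output: each model field becomes a '<Model>_<field>' key and uid is 'content_type_id##pk'. The following is a rough sketch inferred from these tests, not the project's implementation:

from django.contrib.contenttypes.models import ContentType

def object_to_dict(model, obj):
    """Flatten a model instance into a solr document."""
    if model == SavedSearch and obj.user is None:
        # Searches without a recipient are never indexed
        # (see test_savedsearch_no_user below).
        return None
    content_type = ContentType.objects.get_for_model(model)
    doc = {'uid': '%s##%s' % (content_type.pk, obj.pk)}
    for field in model._meta.fields:
        doc['%s_%s' % (model.__name__, field.attname)] = \
            getattr(obj, field.attname)
    if model == SavedSearch:
        # The SavedSearch test below shows the recipient's User fields
        # flattened into the same document.
        for field in User._meta.fields:
            doc['User_%s' % field.attname] = getattr(obj.user, field.attname)
    return doc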
Example #4
    def test_savedsearch_no_user(self):
        """
        A saved search with a null recipient should never be inserted into Solr
        """
        solr = Solr()
        SavedSearchFactory(user=self.user)
        update_solr_task(self.test_solr)
        results = solr.search(q='*:*')
        # One hit for the search, one for its recipient
        self.assertEqual(results.hits, 2)

        solr.delete()
        search = SavedSearchFactory(user=None)
        self.assertEqual(object_to_dict(SavedSearch, search), None)
        update_solr_task(self.test_solr)
        results = solr.search(q='*:*')
        self.assertEqual(results.hits, 0)
Example #5
    def test_savedsearch_to_dict(self):
        """
        Confirms that a solr dictionary is being generated as expected by
        the object_to_dict function for SavedSearch.

        """
        user = UserFactory(email="*****@*****.**")
        search = SavedSearchFactory(user=user)
        content_type = ContentType.objects.get_for_model(SavedSearch)
        expected = {'User_is_superuser': False,
                    'uid': '%s##%s' % (str(content_type.pk), str(search.pk)),
                    'User_is_staff': False,
                    'SavedSearch_day_of_month': None,
                    'User_is_disabled': False,
                    'SavedSearch_last_sent': None,
                    'User_email': '*****@*****.**',
                    'SavedSearch_feed': 'http://www.my.jobs/jobs/feed/rss?',
                    'SavedSearch_is_active': True,
                    'SavedSearch_label': 'All Jobs',
                    'User_user_guid': '9ba19d0d-6ee1-4032-a2b8-50a1fc4c1ab5',
                    u'SavedSearch_id': 1,
                    'SavedSearch_email': '*****@*****.**',
                    'SavedSearch_notes': 'All jobs from www.my.jobs',
                    'SavedSearch_frequency': 'W',
                    u'User_id': 1,
                    'User_gravatar': '*****@*****.**',
                    'User_last_name': u'',
                    'SavedSearch_user_id': 1,
                    'User_opt_in_myjobs': True,
                    'User_profile_completion': 0,
                    'SavedSearch_day_of_week': '1',
                    'User_is_active': True,
                    'User_first_name': u'',
                    'SavedSearch_url': 'http://www.my.jobs/jobs',
                    'User_opt_in_employers': True,
                    'SavedSearch_sort_by': 'Relevance'
        }

        result = object_to_dict(SavedSearch, search)

        # Exact dictionary comparisons can't be made because of the datetime
        # fields, so compare a few fields instead.
        self.assertEqual(expected['uid'], result['uid'])
        self.assertEqual(expected['User_email'], result['User_email'])
        self.assertEqual(expected['SavedSearch_url'], result['SavedSearch_url'])
Example #6
File: tasks.py Project: wejhink/MyJobs
def parse_log(logs, solr_location):
    """
    Turns a list of boto keys into a list of dicts, with each dict representing
    a line from the keys

    Inputs:
    :logs: List of logs generated by boto that reference files on s3
        Lines in analytics logs are formatted as follows:
            %{%Y-%m-%d %H:%M:%S}t %a %m %U %q %H %s %{Referer}i %{aguid}C
                %{myguid}C %{user-agent}i
        Lines in redirect logs are formatted slightly differently:
            %{%Y-%m-%d %H:%M:%S}t %a %m %U %{X-REDIRECT}o %p %u %{X-Real-IP}i
                %H "%{User-agent}i" %{r.my.jobs}C %{Referer}i %V %>s %O %I %D

    :solr_location: Dict of separate cores to be updated
    """
    # Logs are potentially very large. If we are going to look up the company
    # associated with each hit, we should memoize the ids.
    log_memo = {}

    for log in logs:
        to_solr = []
        path = '/tmp/parse_log'
        # Ensure local temp storage for log files exists
        try:
            os.mkdir(path)
        except OSError:
            if not os.path.isdir(path):
                raise
        f = open('%s/%s' % (path, uuid.uuid4().hex), 'w+')
        try:
            log.get_contents_to_file(f)
            f.seek(0)

            for line in f:
                if line[0] == '#':
                    # Logs contain a header that LogParser uses to determine
                    # the log format; if we see this, ignore it
                    continue

                # iterating over a file keeps trailing newlines; strip them
                line = line.rstrip('\n')
                line = line.split(' ')

                # reconstruct date and time
                line[0] = '%s %s' % (line[0], line[1])
                # turn date and time into a datetime object
                line[0] = datetime.strptime(line[0], '%Y-%m-%d %H:%M:%S')
                # remove the time portion, which is now merged with the date
                del line[1]

                # reconstruct user agent
                # and remove it from the line
                if 'redirect' in log.key:
                    ua = ' '.join(line[9:-7])
                    del line[9:-7]
                else:
                    ua = line[8]
                    del line[8]

                if not helpers.is_bot(ua):
                    # Only track hits that come from actual users
                    update_dict = {
                        'view_date': line[0],
                        'doc_type': 'analytics',
                    }

                    # Make sure the value for a given key is only a list if
                    # there are multiple elements
                    qs = dict((k, v if len(v) > 1 else v[0])
                              for k, v in urlparse.parse_qs(
                                  line[4]).iteritems())

                    if 'redirect' in log.key:
                        aguid = qs.get('jcnlx.aguid', '')
                        myguid = qs.get('jcnlx.myguid', '')
                        update_dict['view_source'] = qs.get('jcnlx.vsid', 0)
                        update_dict['job_view_buid'] = qs.get('jcnlx.buid', '0')

                        # GUID is the path portion of this line, which starts
                        # with a '/'; remove it
                        update_dict['job_view_guid'] = line[3][1:]
                        update_dict['page_category'] = 'redirect'
                        domain = qs.get('jcnlx.ref', '')
                        domain = urlparse.urlparse(domain).netloc
                        update_dict['domain'] = domain
                    else:
                        aguid = qs.get('aguid', '')
                        myguid = qs.get('myguid', '')
                        update_dict['view_source'] = qs.get('jvs', 0)
                        update_dict['job_view_buid'] = qs.get('jvb', '0')
                        update_dict['job_view_guid'] = qs.get('jvg', '')
                        update_dict['page_category'] = qs.get('pc', '')

                        # These fields are only set in analytics logs
                        update_dict['domain'] = qs.get('d', '')
                        update_dict['facets'] = qs.get('f', '')
                        update_dict['job_view_title_exact'] = qs.get('jvt', '')
                        update_dict['job_view_company_exact'] = qs.get('jvc', '')
                        update_dict['job_view_location_exact'] = qs.get('jvl', '')
                        update_dict['job_view_canonical_domain'] = qs.get('jvcd', '')
                        update_dict['search_location'] = qs.get('sl', '')
                        update_dict['search_query'] = qs.get('sq', '')
                        update_dict['site_tag'] = qs.get('st', '')
                        update_dict['special_commitment'] = qs.get('sc', '')

                    # Handle logs containing the old aguid/myguid formats
                    aguid = aguid.replace('{', '').replace('}', '').replace('-', '')
                    update_dict['aguid'] = aguid

                    myguid = myguid.replace('-', '')

                    if myguid:
                        try:
                            user = User.objects.get(user_guid=myguid)
                        except User.DoesNotExist:
                            update_dict['User_user_guid'] = ''
                        else:
                            update_dict.update(object_to_dict(User, user))

                    buid = update_dict['job_view_buid']
                    domain = update_dict.get('domain', None)
                    if not (buid in log_memo or domain in log_memo):
                        # We haven't seen this buid or domain before
                        if buid == '0' and domain is not None:
                            # Retrieve company id via domain
                            try:
                                site = SeoSite.objects.get(domain=domain)
                                company_id = site.business_units.values_list(
                                    'company__pk', flat=True)[0]
                            except (SeoSite.DoesNotExist,
                                    IndexError):
                                # SeoSite.DoesNotExist: Site does not exist
                                #   with the given domain
                                # IndexError: SeoSite exists, but is not
                                #   associated with business units or companies
                                company_id = 999999
                            key = domain
                        else:
                            # Retrieve company id via buid
                            try:
                                # See if there is a company associated with it
                                company_id = Company.objects.filter(
                                    job_source_ids=buid)[0].pk
                            except IndexError:
                                # There is not; default to DirectEmployers
                                # Association
                                company_id = 999999
                            key = buid

                        # The defining feature of a given document will either
                        # be the domain or the buid.
                        # Our memoization dict will have the following structure
                        # {str(buid): int(company_id),
                        #  str(domain): int(company_id)}
                        log_memo[key] = company_id

                    # By this point, we are guaranteed that the correct key is
                    # in log_memo; pull the company id from the memo dict.
                    if domain is not None and domain in log_memo:
                        update_dict['company_id'] = log_memo[domain]
                    else:
                        update_dict['company_id'] = log_memo[buid]

                    update_dict['uid'] = 'analytics##%s#%s' % \
                                         (update_dict['view_date'], aguid)
                    to_solr.append(update_dict)
        except Exception:
            # There may be more logs to process; don't propagate the exception
            pass
        finally:
            # remove the file from the filesystem to ensure we don't fill the
            # drive (again)
            f.close()
            os.remove(f.name)

        # Ensure all hits get recorded by breaking a potentially massive list
        # down into something that solr can manage
        subsets = split_list(to_solr, 500)
        for location in solr_location.values():
            solr = pysolr.Solr(location)
            for subset in subsets:
                try:
                    subset = filter(None, subset)
                    solr.add(subset)
                except pysolr.SolrError:
                    # There is something wrong with this chunk of data. It's
                    # better to lose 500 documents than the entire file
                    pass
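A sketch of how parse_log might be driven, assuming boto 2.x (the get_contents_to_file() call above is boto 2 API); the bucket name, prefix, and settings attribute names here are illustrative only:

import boto
from django.conf import settings

def reindex_analytics_logs():
    # Hypothetical credentials and bucket; adjust to the real deployment.
    conn = boto.connect_s3(settings.AWS_ACCESS_KEY_ID,
                           settings.AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket('my-jobs-logs')
    # bucket.list() yields boto Key objects, which is what parse_log()
    # expects for its logs argument.
    logs = bucket.list(prefix='analytics/')
    parse_log(logs, settings.SOLR)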