def task_reindex_solr(solr_location=None):
    """
    Adds all ProfileUnits, Users, and SavedSearches to solr.

    Inputs:
    :solr_location: Dict of separate cores to be updated (Optional);
        defaults to the default instance from settings

    """
    if solr_location is None:
        solr_location = settings.SOLR
    l = []

    # Add a ProfileUnits document for every user
    u = User.objects.all().values_list('id', flat=True)
    for x in u:
        l.append(profileunits_to_dict(x))

    # Add every saved search that still has a recipient
    s = SavedSearch.objects.filter(user__isnull=False)
    for x in s:
        saved_search_dict = object_to_dict(SavedSearch, x)
        saved_search_dict['doc_type'] = 'savedsearch'
        l.append(saved_search_dict)

    # Add a document for every user
    u = User.objects.all()
    for x in u:
        l.append(object_to_dict(User, x))

    # Break the documents into chunks small enough for solr to handle
    l = split_list(l, 1000)

    for location in solr_location.values():
        solr = pysolr.Solr(location)
        for x in l:
            x = filter(None, list(x))
            solr.add(x)
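# A minimal sketch of the `split_list` helper used above, under the
# assumption that it is the standard izip_longest "grouper" recipe. That is
# only an inference from usage: the real helper is defined elsewhere, and the
# None padding a grouper produces would explain why every caller runs
# filter(None, ...) on each chunk before passing it to solr.add.
from itertools import izip_longest


def _split_list_sketch(iterable, chunk_size, fill_value=None):
    # Repeat one iterator chunk_size times; izip_longest then pulls
    # chunk_size items per chunk, padding the last chunk with fill_value.
    args = [iter(iterable)] * chunk_size
    return izip_longest(*args, fillvalue=fill_value)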
def update_solr_task(solr_location=None):
    """
    Deletes all items scheduled for deletion, and then adds all items
    scheduled to be added to solr.

    Inputs:
    :solr_location: Dict of separate cores to be updated

    """
    if hasattr(mail, 'outbox'):
        solr_location = settings.TEST_SOLR_INSTANCE
    elif solr_location is None:
        solr_location = settings.SOLR
    objs = Update.objects.filter(delete=True).values_list('uid', flat=True)

    if objs:
        objs = split_list(objs, 1000)
        for obj_list in objs:
            obj_list = filter(None, list(obj_list))
            uid_list = " OR ".join(obj_list)
            for location in solr_location.values():
                solr = pysolr.Solr(location)
                solr.delete(q="uid:(%s)" % uid_list)
        Update.objects.filter(delete=True).delete()

    objs = Update.objects.filter(delete=False)
    updates = []

    for obj in objs:
        content_type, key = obj.uid.split("##")
        model = ContentType.objects.get_for_id(content_type).model_class()
        if model == SavedSearch:
            search = model.objects.get(pk=key)
            # Saved search recipients can currently be null; displaying
            # these searches may be implemented in the future but will
            # likely require some template work.
            if search.user:
                updates.append(object_to_dict(model, search))
        # If the user is being updated, every SavedSearch belonging to that
        # user also has to be updated, because the user is stored on the
        # SavedSearch document.
        elif model == User:
            searches = SavedSearch.objects.filter(user_id=key)
            for search in searches:
                updates.append(object_to_dict(SavedSearch, search))
            updates.append(object_to_dict(model, model.objects.get(pk=key)))
        else:
            updates.append(profileunits_to_dict(key))

    updates = split_list(updates, 1000)
    for location in solr_location.values():
        solr = pysolr.Solr(location)
        for update_subset in updates:
            update_subset = filter(None, list(update_subset))
            solr.add(list(update_subset))

    objs.delete()
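# Each Update row stores a uid of the form "<content_type_pk>##<object_pk>":
# update_solr_task splits it on "##" to recover the model and key, and the
# delete branch joins many uids into a single solr delete query. A throwaway
# illustration of that query construction (the uids here are made up):
def _example_uid_delete_query():
    uids = ['23##1', '23##2', '47##9']
    return "uid:(%s)" % " OR ".join(uids)
    # -> 'uid:(23##1 OR 23##2 OR 47##9)'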
def test_user_to_dict(self):
    """
    Confirms that a solr dictionary is being generated as expected by
    the object_to_dict function for Users.

    """
    user = UserFactory(email="*****@*****.**")
    content_type = ContentType.objects.get_for_model(User)

    expected = {
        'User_is_superuser': False,
        u'User_id': 1,
        'uid': '%s##%s' % (str(content_type.pk), str(user.pk)),
        'User_is_active': True,
        'User_user_guid': 'c1cf679c-86f8-4bce-bf1a-ade8341cd3c1',
        'User_is_staff': False,
        'User_first_name': u'',
        'User_gravatar': '*****@*****.**',
        'User_last_name': u'',
        'User_is_disabled': False,
        'User_opt_in_myjobs': True,
        'User_profile_completion': 0,
        'User_opt_in_employers': True,
        'User_email': '*****@*****.**',
    }
    result = object_to_dict(User, user)

    # Exact dictionary comparisons can't be made because of the datetime
    # fields, so compare a few fields instead.
    self.assertEqual(expected['uid'], result['uid'])
    self.assertEqual(expected['User_email'], result['User_email'])
def test_savedsearch_no_user(self):
    """
    A saved search with a null recipient should never be inserted in Solr.

    """
    solr = Solr()
    SavedSearchFactory(user=self.user)
    update_solr_task(self.test_solr)
    results = solr.search(q='*:*')
    # One hit for the search, one for its recipient
    self.assertEqual(results.hits, 2)
    solr.delete()

    search = SavedSearchFactory(user=None)
    self.assertEqual(object_to_dict(SavedSearch, search), None)
    update_solr_task(self.test_solr)
    results = solr.search(q='*:*')
    self.assertEqual(results.hits, 0)
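# Because object_to_dict returns None for a recipient-less SavedSearch (as
# asserted above), the filter(None, ...) pass in update_solr_task is what
# keeps such searches out of Solr. A throwaway illustration with placeholder
# documents:
def _example_none_filtering():
    docs = [{'uid': '23##1'}, None, {'uid': '23##2'}]
    return filter(None, docs)  # -> [{'uid': '23##1'}, {'uid': '23##2'}]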
def test_savedsearch_to_dict(self):
    """
    Confirms that a solr dictionary is being generated as expected by
    the object_to_dict function for SavedSearch.

    """
    user = UserFactory(email="*****@*****.**")
    search = SavedSearchFactory(user=user)
    content_type = ContentType.objects.get_for_model(SavedSearch)

    expected = {
        'User_is_superuser': False,
        'uid': '%s##%s' % (str(content_type.pk), str(search.pk)),
        'User_is_staff': False,
        'SavedSearch_day_of_month': None,
        'User_is_disabled': False,
        'SavedSearch_last_sent': None,
        'User_email': '*****@*****.**',
        'SavedSearch_feed': 'http://www.my.jobs/jobs/feed/rss?',
        'SavedSearch_is_active': True,
        'SavedSearch_label': 'All Jobs',
        'User_user_guid': '9ba19d0d-6ee1-4032-a2b8-50a1fc4c1ab5',
        u'SavedSearch_id': 1,
        'SavedSearch_email': '*****@*****.**',
        'SavedSearch_notes': 'All jobs from www.my.jobs',
        'SavedSearch_frequency': 'W',
        u'User_id': 1,
        'User_gravatar': '*****@*****.**',
        'User_last_name': u'',
        'SavedSearch_user_id': 1,
        'User_opt_in_myjobs': True,
        'User_profile_completion': 0,
        'SavedSearch_day_of_week': '1',
        'User_is_active': True,
        'User_first_name': u'',
        'SavedSearch_url': 'http://www.my.jobs/jobs',
        'User_opt_in_employers': True,
        'SavedSearch_sort_by': 'Relevance',
    }
    result = object_to_dict(SavedSearch, search)

    # Exact dictionary comparisons can't be made because of the datetime
    # fields, so compare a few fields instead.
    self.assertEqual(expected['uid'], result['uid'])
    self.assertEqual(expected['User_email'], result['User_email'])
    self.assertEqual(expected['SavedSearch_url'],
                     result['SavedSearch_url'])
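# The expected dictionaries in the two tests above imply object_to_dict's
# conventions: each model field becomes "<ModelName>_<attname>", and "uid" is
# "<content_type_pk>##<instance_pk>". A minimal sketch of those conventions
# only; the real implementation lives elsewhere and, as the SavedSearch test
# shows, also merges in the recipient's User fields:
def _object_to_dict_sketch(model, obj):
    content_type = ContentType.objects.get_for_model(model)
    doc = dict(('%s_%s' % (model.__name__, field.attname),
                getattr(obj, field.attname))
               for field in obj._meta.fields)
    doc['uid'] = '%s##%s' % (content_type.pk, obj.pk)
    return doc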
def parse_log(logs, solr_location):
    """
    Turns a list of boto keys into a list of dicts, with each dict
    representing a line from the keys.

    Inputs:
    :logs: List of logs generated by boto that reference files on s3

        Lines in analytics logs are formatted as follows:
        %{%Y-%m-%d %H:%M:%S}t %a %m %U %q %H %s %{Referer}i
            %{aguid}C %{myguid}C %{user-agent}i

        Lines in redirect logs are formatted slightly differently:
        %{%Y-%m-%d %H:%M:%S}t %a %m %U %{X-REDIRECT}o %p %u %{X-Real-IP}i
            %H "%{User-agent}i" %{r.my.jobs}C %{Referer}i %V %>s %O %I %D

    :solr_location: Dict of separate cores to be updated

    """
    # Logs are potentially very large. If we are going to look up the company
    # associated with each hit, we should memoize the ids.
    log_memo = {}

    for log in logs:
        to_solr = []
        path = '/tmp/parse_log'

        # Ensure local temp storage for log files exists
        try:
            os.mkdir(path)
        except OSError:
            if not os.path.isdir(path):
                raise

        f = open('%s/%s' % (path, uuid.uuid4().hex), 'w+')
        try:
            log.get_contents_to_file(f)
            f.seek(0)

            for line in f:
                if line[0] == '#':
                    # Logs contain a header that LogParser uses to determine
                    # the log format; if we see this, ignore it
                    continue

                # line in f does not strip newlines if they exist
                line = line.rstrip('\n')
                line = line.split(' ')

                # reconstruct date and time
                line[0] = '%s %s' % (line[0], line[1])
                # turn date and time into a datetime object
                line[0] = datetime.strptime(line[0], '%Y-%m-%d %H:%M:%S')
                # remove the time portion, which is now merged with the date
                del line[1]

                # reconstruct user agent and remove it from the line
                if 'redirect' in log.key:
                    ua = ' '.join(line[9:-7])
                    del line[9:-7]
                else:
                    ua = line[8]
                    del line[8]

                if not helpers.is_bot(ua):
                    # Only track hits that come from actual users
                    update_dict = {
                        'view_date': line[0],
                        'doc_type': 'analytics',
                    }

                    # Make sure the value for a given key is only a list if
                    # there are multiple elements
                    qs = dict((k, v if len(v) > 1 else v[0])
                              for k, v in urlparse.parse_qs(
                                  line[4]).iteritems())

                    if 'redirect' in log.key:
                        aguid = qs.get('jcnlx.aguid', '')
                        myguid = qs.get('jcnlx.myguid', '')
                        update_dict['view_source'] = qs.get('jcnlx.vsid', 0)
                        update_dict['job_view_buid'] = qs.get('jcnlx.buid',
                                                              '0')
                        # GUID is the path portion of this line, which starts
                        # with a '/'; remove it
                        update_dict['job_view_guid'] = line[3][1:]
                        update_dict['page_category'] = 'redirect'
                        domain = qs.get('jcnlx.ref', '')
                        domain = urlparse.urlparse(domain).netloc
                        update_dict['domain'] = domain
                    else:
                        aguid = qs.get('aguid', '')
                        myguid = qs.get('myguid', '')
                        update_dict['view_source'] = qs.get('jvs', 0)
                        update_dict['job_view_buid'] = qs.get('jvb', '0')
                        update_dict['job_view_guid'] = qs.get('jvg', '')
                        update_dict['page_category'] = qs.get('pc', '')

                        # These fields are only set in analytics logs
                        update_dict['domain'] = qs.get('d', '')
                        update_dict['facets'] = qs.get('f', '')
                        update_dict['job_view_title_exact'] = qs.get('jvt',
                                                                     '')
                        update_dict['job_view_company_exact'] = qs.get('jvc',
                                                                       '')
                        update_dict['job_view_location_exact'] = qs.get('jvl',
                                                                        '')
                        update_dict['job_view_canonical_domain'] = \
                            qs.get('jvcd', '')
                        update_dict['search_location'] = qs.get('sl', '')
                        update_dict['search_query'] = qs.get('sq', '')
                        update_dict['site_tag'] = qs.get('st', '')
                        update_dict['special_commitment'] = qs.get('sc', '')

                    # Handle logs containing the old aguid/myguid formats
                    aguid = aguid.replace('{', '').replace('}', '')
                    aguid = aguid.replace('-', '')
                    update_dict['aguid'] = aguid

                    myguid = myguid.replace('-', '')
                    if myguid:
                        try:
                            user = User.objects.get(user_guid=myguid)
                        except User.DoesNotExist:
                            update_dict['User_user_guid'] = ''
                        else:
                            update_dict.update(object_to_dict(User, user))

                    buid = update_dict['job_view_buid']
                    domain = update_dict.get('domain', None)
                    if not (buid in log_memo or domain in log_memo):
                        # We haven't seen this buid or domain before
                        if buid == '0' and domain is not None:
                            # Retrieve company id via domain
                            try:
                                site = SeoSite.objects.get(domain=domain)
                                company_id = site.business_units.values_list(
                                    'company__pk', flat=True)[0]
                            except (SeoSite.DoesNotExist, IndexError):
                                # SeoSite.DoesNotExist: Site does not exist
                                #     with the given domain
                                # IndexError: SeoSite exists, but is not
                                #     associated with business units or
                                #     companies
                                company_id = 999999
                            key = domain
                        else:
                            # Retrieve company id via buid
                            try:
                                # See if there is a company associated with it
                                company_id = Company.objects.filter(
                                    job_source_ids=buid)[0].pk
                            except IndexError:
                                # There is not; default to DirectEmployers
                                # Association
                                company_id = 999999
                            key = buid

                        # The defining feature of a given document will either
                        # be the domain or the buid. Our memoization dict will
                        # have the following structure:
                        # {str(buid): int(company_id),
                        #  str(domain): int(company_id)}
                        log_memo[key] = company_id

                    # By this point, we are guaranteed that the correct key is
                    # in log_memo; pull the company id from the memo dict.
                    if domain is not None and domain in log_memo:
                        update_dict['company_id'] = log_memo[domain]
                    else:
                        update_dict['company_id'] = log_memo[buid]

                    update_dict['uid'] = 'analytics##%s#%s' % \
                        (update_dict['view_date'], aguid)
                    to_solr.append(update_dict)
        except Exception:
            # There may be more logs to process; don't propagate the exception
            pass
        finally:
            # remove the file from the filesystem to ensure we don't fill the
            # drive (again)
            f.close()
            os.remove(f.name)

        # Ensure all hits get recorded by breaking a potentially massive list
        # down into something that solr can manage
        subsets = split_list(to_solr, 500)
        for location in solr_location.values():
            solr = pysolr.Solr(location)
            for subset in subsets:
                try:
                    subset = filter(None, subset)
                    solr.add(subset)
                except pysolr.SolrError:
                    # There is something wrong with this chunk of data. It's
                    # better to lose 500 documents than the entire file
                    pass
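# urlparse.parse_qs always maps each key to a list; the normalization inside
# parse_log keeps a list only when a key actually repeats. A throwaway
# illustration (the query string is made up):
def _example_qs_normalization():
    raw = urlparse.parse_qs('jvs=5&f=red&f=blue')
    # raw == {'jvs': ['5'], 'f': ['red', 'blue']}
    return dict((k, v if len(v) > 1 else v[0]) for k, v in raw.iteritems())
    # -> {'jvs': '5', 'f': ['red', 'blue']}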