def rewrite_result(result):
    '''\
    Rewrites the HTML in this result (question, answers and comments) so links
    to other StackExchange sites that exist in Stackdump are rewritten, links
    elsewhere are decorated with a CSS class, and all images are replaced with
    a placeholder.

    The JSON must have been decoded first.
    '''
    app_url_root = settings.APP_URL_ROOT

    # get a list of all the site base URLs
    sites = list(Site.select())
    sites_by_urls = dict([ (s.base_url, s) for s in sites ])

    # rewrite question
    question = result.get('question')
    if question:
        question['body'] = _rewrite_html(question.get('body'), app_url_root, sites_by_urls)
        for c in question.get('comments', [ ]):
            c['text'] = _rewrite_html(c.get('text'), app_url_root, sites_by_urls)

    # rewrite answers
    answers = result.get('answers')
    if answers:
        for a in answers:
            a['body'] = _rewrite_html(a.get('body'), app_url_root, sites_by_urls)
            for c in a.get('comments', [ ]):
                c['text'] = _rewrite_html(c.get('text'), app_url_root, sites_by_urls)
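# The function below is a minimal, hypothetical sketch of how rewrite_result()
# might be exercised. The result dict shape (question/answers/comments with
# 'body' and 'text' fields) comes from the function above; the sample data and
# the helper name are assumptions made purely for illustration.
def _example_rewrite_result_usage():
    # a decoded search result, as it would look after JSON decoding
    result = {
        'siteKey': 'superuser.com',
        'question': {
            'body': '<p>see <a href="http://superuser.com/q/1">this</a></p>',
            'comments': [{'text': '<p>a comment</p>'}],
        },
        'answers': [
            {'body': '<p><img src="http://example.com/x.png"/></p>', 'comments': []},
        ],
    }
    rewrite_result(result)  # bodies and comment texts now hold rewritten HTML
    return result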
def save_to_db(self, dic):
    assert all(map(dic.has_key, ['title', 'original_price', 'price', 'detail', 'url'])), \
        "Information incomplete."

    url = dic['url']
    original_price = dic['original_price'].text.encode('utf8')
    price = dic['price'].text.encode('utf8')
    title = dic['title'].text  # title is unicode
    detail = dic['detail'].renderContents(encoding='utf8')
    detail = utils.clean_detail(detail, self.home_url)

    # Data formatting & validation.
    try:
        original_price, price = map(lambda s: int(re.search(r'(\d+)', s).group()),
                                    [original_price, price])
    except TypeError:
        logging.error("Price conversion failed. Detailed info: %s", [original_price, price])
        return
    except AttributeError:
        logging.error("Regex failed on %s", [original_price, price])
        return

    if len(title) > 500 or len(title) < 10:
        logging.error("Title too short or too long: %s", title)
        return
    if len(detail) < 20:
        logging.error("Detail too short: %s", detail)
        return

    # Save to db.
    try:
        site = Site.select(Site.q.url == self.home_url)
        assert site.count() == 1, "%s not found or dups." % self.home_url

        title = utils.lstrip(title, [s.decode('utf8') for s in ('今日团购', '今日精选', ':')])
        title = title.strip()
        title = '[%s] %s' % (site[0].name, title)

        city_name = self.index_urls[url]
        city = City.select(City.q.name == city_name.decode('utf8'))
        assert city.count() == 1, "%s not found or dups." % city_name
        cityID = city[0].id

        if Deal.select(AND(Deal.q.title == title, Deal.q.cityID == cityID)).count() > 0:
            logging.info("Title dups %s", title)
            return

        deal = Deal(url=url, title=title, price=price, originalPrice=original_price,
                    detail=detail.decode('utf8'), cityID=cityID, siteID=site[0].id)
        logging.info('%s OK', url)
    except:
        # Simple handling for the moment.
        logging.error("Error occurred while saving data: %s", sys.exc_info())
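# A minimal sketch of the price-conversion step used in save_to_db(), pulled
# out on its own so the regex behaviour is easy to see. _parse_price is a
# hypothetical helper, not part of the scraper; it mirrors the
# int(re.search(r'(\d+)', s).group()) expression above and fails the same way
# when the string contains no digits.
import re

def _parse_price(s):
    '''Return the first run of digits in s as an int, e.g. "￥128 元" -> 128.'''
    match = re.search(r'(\d+)', s)
    if match is None:
        raise ValueError("no digits found in %r" % s)
    return int(match.group())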
def get_sites():
    '''\
    Retrieves a list of Site objects or if there are none, raises a
    NoSitesImportedError. This error is designed to trigger the 500 error
    handler.
    '''
    sites = list(Site.select().orderBy('name'))
    if len(sites) == 0:
        raise NoSitesImportedError()

    return sites
def retrieve_sites(results):
    '''\
    Retrieves the site objects associated with the results.
    '''
    # get a list of all the site keys
    site_keys = set()
    for r in results:
        site_keys.add(r['siteKey'])

    # retrieve the site objects from the database
    site_objects = Site.select(IN(Site.q.key, list(site_keys)))

    # convert results into a dict with site key as the key
    sites = { }
    for s in site_objects:
        sites[s.key] = s

    # place site objects into the dict
    for r in results:
        site_key = r['siteKey']
        r['site'] = sites[site_key]
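# A small, hypothetical illustration of what retrieve_sites() does to the
# results list. The 'siteKey' values and the helper name are assumptions for
# the sake of the example, not data from the application, and the keys must
# exist as Site rows in the database for the lookup to succeed.
def _example_retrieve_sites_usage():
    results = [
        {'siteKey': 'superuser.com', 'id': 'superuser.com-1'},
        {'siteKey': 'serverfault.com', 'id': 'serverfault.com-7'},
    ]
    retrieve_sites(results)
    # each result now carries its Site object under the 'site' key,
    # e.g. results[0]['site'].name
    return results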
def test_Count_Sites(self):
    for i in range(10):
        site = create_dummy_site()
        site.save()

    self.assertEqual(Site.select().count(), 10)
print 'Import aborted on user request.'
sys.exit(3)

# rollback any uncommitted entries in solr. Uncommitted entries may occur if
# this import process is aborted. Solr doesn't have the concept of transactions
# like databases do, so without a rollback, we'll be committing the previously
# uncommitted entries plus the newly imported ones.
#
# This also means multiple dataproc processes cannot occur concurrently. If you
# do, the import will be silently incomplete.
print('Clearing any uncommitted entries in solr...')
solr._update('<rollback />', waitFlush=None, waitSearcher=None)
print('Cleared.\n')

# check if the site is already in the database; if so, purge its data.
site = list(Site.select(Site.q.key == site_key))
if len(site) > 0:
    site = site[0]
    print('Deleting site "%s" from the database... ' % site.name)
    sys.stdout.flush()

    # Using SQLObject to delete rows takes too long, so we're going to do it directly
    #Site.delete(site.id) # the relationship cascades, so other rows will be deleted

    sqlhub.threadConnection = sqlhub.processConnection.transaction()
    conn = sqlhub.threadConnection

    # these deletions are done in this order to avoid FK constraint issues
    print('\tDeleting badges...')
    conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))

    print('\tDeleting users...')
    conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))

    print('\tDeleting site...')
    conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
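# A hedged sketch (not taken from the import script itself) of how the
# transaction opened above would typically be resolved with SQLObject: commit
# the deletes if they all ran, otherwise roll back, and in either case hand the
# thread back to the normal process-wide connection. _finish_site_purge is a
# hypothetical helper name.
def _finish_site_purge(conn):
    try:
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # restore the default connection for this thread
        sqlhub.threadConnection = sqlhub.processConnection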
def retrieve_users(results, question_only=False, ignore_comments=False):
    '''\
    Retrieves the user objects associated with the question objects.
    '''
    # get a list of all the user IDs
    user_ids_by_site = { }
    for r in results:
        site_key = r['siteKey']
        if site_key not in user_ids_by_site.keys():
            user_ids_by_site[site_key] = set()

        # the search result object itself
        for k in r.keys():
            if k.lower().endswith('userid'):
                user_ids_by_site[site_key].add(r[k])

        # the question object
        question = r['question']
        for k in question.keys():
            if k.lower().endswith('userid'):
                user_ids_by_site[site_key].add(question[k])

        comments = question.get('comments')
        if not ignore_comments and comments:
            for c in comments:
                for ck in c.keys():
                    if ck.lower().endswith('userid'):
                        user_ids_by_site[site_key].add(c[ck])

        # the answers
        answers = r.get('answers')
        if not question_only and answers:
            for a in answers:
                for k in a.keys():
                    if k.lower().endswith('userid'):
                        user_ids_by_site[site_key].add(a[k])

                comments = a.get('comments')
                if not ignore_comments and comments:
                    for c in comments:
                        for ck in c.keys():
                            if ck.lower().endswith('userid'):
                                user_ids_by_site[site_key].add(c[ck])

    # retrieve the user objects from the database by site
    users_by_site = { }
    for site_key in user_ids_by_site.keys():
        site = Site.select(Site.q.key == site_key).getOne()
        user_objects = User.select(AND(User.q.site == site,
                                       IN(User.q.sourceId, list(user_ids_by_site[site_key]))))

        # convert results into a dict with user id as the key
        users = { }
        for u in user_objects:
            users[u.sourceId] = u

        users_by_site[site_key] = users

    # place user objects into the dict
    for r in results:
        site_key = r['siteKey']

        # the search result object itself
        for k in r.keys():
            if k.lower().endswith('userid'):
                # use the same field name, minus the 'Id' on the end.
                r[k[:-2]] = users_by_site[site_key].get(r[k])

        # the question object
        question = r['question']
        for k in question.keys():
            if k.lower().endswith('userid'):
                # use the same field name, minus the 'Id' on the end.
                question[k[:-2]] = users_by_site[site_key].get(question[k])

        comments = question.get('comments')
        if not ignore_comments and comments:
            for c in comments:
                for ck in c.keys():
                    if ck.lower().endswith('userid'):
                        # use the same field name, minus the 'Id' on the end.
                        c[ck[:-2]] = users_by_site[site_key].get(c[ck])

        # the answers
        answers = r.get('answers')
        if not question_only and answers:
            for a in answers:
                for k in a.keys():
                    if k.lower().endswith('userid'):
                        # use the same field name, minus the 'Id' on the end.
                        a[k[:-2]] = users_by_site[site_key].get(a[k])

                comments = a.get('comments')
                if not ignore_comments and comments:
                    for c in comments:
                        for ck in c.keys():
                            if ck.lower().endswith('userid'):
                                # use the same field name, minus the 'Id' on the end.
                                c[ck[:-2]] = users_by_site[site_key].get(c[ck])
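# A quick, hypothetical illustration of the field-renaming convention used in
# retrieve_users(): any key ending in 'UserId' gains a sibling key with the
# trailing 'Id' dropped, holding the resolved User object (or None if the id
# was not found). The field name 'ownerUserId' is only an example.
def _example_user_field_renaming():
    k = 'ownerUserId'
    assert k.lower().endswith('userid')
    resolved_field = k[:-2]  # -> 'ownerUser'
    return resolved_field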