Example No. 1
def rewrite_result(result):
    '''\
    Rewrites the HTML in this result (question, answers and comments) so
    links to other StackExchange sites that exist in Stackdump are rewritten,
    links elsewhere are decorated with a CSS class, and all images are replaced
    with a placeholder.
    
    The JSON must have been decoded first.
    '''
    app_url_root = settings.APP_URL_ROOT
    
    # get a list of all the site base URLs
    sites = list(Site.select())
    sites_by_urls = dict([ (s.base_url, s) for s in sites ])
    
    # rewrite question
    question = result.get('question')
    if question:
        question['body'] = _rewrite_html(question.get('body'), app_url_root, sites_by_urls)
        for c in question.get('comments', [ ]):
            c['text'] = _rewrite_html(c.get('text'), app_url_root, sites_by_urls)
    
    # rewrite answers
    answers = result.get('answers')
    if answers:
        for a in answers:
            a['body'] = _rewrite_html(a.get('body'), app_url_root, sites_by_urls)
            for c in a.get('comments', [ ]):
                c['text'] = _rewrite_html(c.get('text'), app_url_root, sites_by_urls)
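
A minimal usage sketch, not part of the original snippet: raw_result_json and the field access below are assumptions based on what the function reads.

import json

result = json.loads(raw_result_json)  # the JSON must be decoded first, per the docstring
rewrite_result(result)

# the result is modified in place: question/answer bodies and comment texts
# now hold the rewritten HTML
html = result['question']['body']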
Example No. 2
    def save_to_db(self, dic):
        assert all(k in dic for k in ('title', 'original_price', 'price', 'detail', 'url')), \
            "Information incomplete."
        
        url = dic['url']
        original_price = dic['original_price'].text.encode('utf8')
        price = dic['price'].text.encode('utf8')
        title = dic['title'].text # title is unicode
        detail = dic['detail'].renderContents(encoding='utf8')
        detail = utils.clean_detail(detail, self.home_url)
            
        # Data formatting & validation.
        try:
            original_price, price = map(lambda s: int(re.search(r'(\d+)', s).group()),
                                        [original_price, price])
        except TypeError:
            logging.error("Price conversion failed. Detailed info: %s", [original_price, price])
            return
        except AttributeError:
            logging.error("Regex failed on %s", [original_price, price])
            return
        
        if len(title) > 500 or len(title) < 10:
            logging.error("Title length too short or too long : %s", title)
            return
        
        if len(detail) < 20:
            logging.error("Detail too short. %s", detail)
            return

        # Save to db.
        try:
            site = Site.select(Site.q.url == self.home_url)
            assert site.count() == 1, "%s not found or dups." % self.home_url
            
            title = utils.lstrip(title, [s.decode('utf8') for s in ('今日团购', '今日精选', ':')])
            title = title.strip()
            title = '[%s] %s' % (site[0].name, title)
            
            city_name = self.index_urls[url]
            city = City.select(City.q.name == city_name.decode('utf8'))
            assert city.count() == 1, "%s not found or dups." % city_name
            cityID = city[0].id
            
            if Deal.select(AND(Deal.q.title == title, Deal.q.cityID == cityID)).count() > 0:
                logging.info("Title dups %s" % title)
                return
            deal = Deal(url=url, title=title, price=price, originalPrice=original_price,
                        detail=detail.decode('utf8'), cityID=cityID, siteID=site[0].id)
            logging.info('%s OK', url)
        except Exception:
            # Simple handling for the moment.
            logging.error("Error occurred while saving data: %s", sys.exc_info())
Example No. 3
def get_sites():
    '''\
    Retrieves a list of Site objects, or raises a NoSitesImportedError if there
    are none. This error is designed to trigger the 500 error handler.
    '''
    print "Debug trace: %s" % (settings.DATABASE_CONN_STR)
    sites = list(Site.select().orderBy('name'))
    if len(sites) == 0:
        raise NoSitesImportedError()
    
    return sites
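
Callers generally just use the returned list and let NoSitesImportedError propagate to the 500 handler. A hypothetical caller; the view wiring is an assumption, not part of the source:

def site_index():
    sites = get_sites()  # raises NoSitesImportedError if nothing has been imported
    return { 'sites': sites }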
Example No. 4
def retrieve_sites(results):
    '''\
    Retrieves the site objects associated with the results.
    '''
    # get a list of all the site keys
    site_keys = set()
    for r in results:
        site_keys.add(r['siteKey'])
    
    # retrieve the site objects from the database
    site_objects = Site.select(IN(Site.q.key, list(site_keys)))
    
    # convert results into a dict with site key as the key
    sites = { }
    for s in site_objects:
        sites[s.key] = s
    
    # place site objects into the dict
    for r in results:
        site_key = r['siteKey']
        r['site'] = sites[site_key]
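
retrieve_sites mutates the results in place, attaching a 'site' entry beside each 'siteKey'. A minimal sketch with made-up site keys:

results = [ { 'siteKey': 'superuser' }, { 'siteKey': 'serverfault' } ]
retrieve_sites(results)
print results[0]['site'].name  # each result now carries its Site object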
Example No. 5
    def test_Count_Sites(self):
        for i in range(10):
            site = create_dummy_site()
            site.save()
        self.assertEqual(Site.select().count(), 10)
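
create_dummy_site() is not shown in this example. A hypothetical version, assuming a peewee-style Site model (consistent with the .save() and Site.select().count() calls above); the field names are guesses for illustration only:

import uuid

def create_dummy_site():
    # returns an unsaved instance; the test calls .save() itself
    return Site(name='site-%s' % uuid.uuid4().hex[:8],
                key=uuid.uuid4().hex,
                base_url='http://example.com/')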
Example No. 6
            print 'Import aborted on user request.'
            sys.exit(3)

    # roll back any uncommitted entries in Solr. Uncommitted entries may occur if
    # this import process is aborted. Solr doesn't have the concept of transactions
    # like databases do, so without a rollback we'd be committing the previously
    # uncommitted entries along with the newly imported ones.
    #
    # This also means multiple dataproc processes cannot run concurrently. If they
    # do, the import will be silently incomplete.
    print('Clearing any uncommitted entries in solr...')
    solr._update('<rollback />', waitFlush=None, waitSearcher=None)
    print('Cleared.\n')

    # check if site is already in database; if so, purge the data.
    site = list(Site.select(Site.q.key==site_key))
    if len(site) > 0:
        site = site[0]
        print('Deleting site "%s" from the database... ' % site.name)
        sys.stdout.flush()
        # Using SQLObject to delete rows takes too long, so we're going to do it directly
        #Site.delete(site.id) # the relationship cascades, so other rows will be deleted
        sqlhub.threadConnection = sqlhub.processConnection.transaction()
        conn = sqlhub.threadConnection
        # these deletions are done in this order to avoid FK constraint issues
        print('\tDeleting badges...')
        conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
        print('\tDeleting users...')
        conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
        print('\tDeleting site...')
        conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
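
The excerpt ends before the transaction opened on sqlhub is finalized; once the deletions complete it would still need to be committed. A minimal sketch of that final step, using SQLObject's Transaction API on the conn handle from above:

# commit the raw deletions performed through the transaction
conn.commit()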
Example No. 7
def retrieve_users(results, question_only=False, ignore_comments=False):
    '''\
    Retrieves the user objects associated with the question objects.
    '''
    # get a list of all the user IDs
    user_ids_by_site = { }
    for r in results:
        site_key = r['siteKey']
        if site_key not in user_ids_by_site:
            user_ids_by_site[site_key] = set()
        
        # the search result object itself
        for k in r.keys():
            if k.lower().endswith('userid'):
                user_ids_by_site[site_key].add(r[k])
        
        # the question object
        question = r['question']
        for k in question.keys():
            if k.lower().endswith('userid'):
                user_ids_by_site[site_key].add(question[k])
        
        # the question's comments
        comments = question.get('comments')
        if not ignore_comments and comments:
            for c in comments:
                for ck in c.keys():
                    if ck.lower().endswith('userid'):
                        user_ids_by_site[site_key].add(c[ck])
        
        # the answers
        answers = r.get('answers')
        if not question_only and answers:
            for a in answers:
                for k in a.keys():
                    if k.lower().endswith('userid'):
                        user_ids_by_site[site_key].add(a[k])
                
                comments = a.get('comments')
                if not ignore_comments and comments:
                    for c in comments:
                        for ck in c.keys():
                            if ck.lower().endswith('userid'):
                                user_ids_by_site[site_key].add(c[ck])
    
    # retrieve the user objects from the database by site
    users_by_site = { }
    for site_key in user_ids_by_site.keys():
        site = Site.select(Site.q.key == site_key).getOne()
        user_objects = User.select(AND(User.q.site == site,
                                       IN(User.q.sourceId, list(user_ids_by_site[site_key]))
                                  ))
        
        # convert results into a dict with user id as the key
        users = { }
        for u in user_objects:
            users[u.sourceId] = u
        
        users_by_site[site_key] = users
    
    # place user objects into the dict
    for r in results:
        site_key = r['siteKey']
        
        # the search result object itself
        for k in r.keys():
            if k.lower().endswith('userid'):
                # use the same field name, minus the 'Id' on the end.
                r[k[:-2]] = users_by_site[site_key].get(r[k])
        
        # the question object
        question = r['question']
        for k in question.keys():
            if k.lower().endswith('userid'):
                # use the same field name, minus the 'Id' on the end.
                question[k[:-2]] = users_by_site[site_key].get(question[k])
            
        comments = question.get('comments')
        if not ignore_comments and comments:
            for c in comments:
                for ck in c.keys():
                    if ck.lower().endswith('userid'):
                        # use the same field name, minus the 'Id' on the end.
                        c[ck[:-2]] = users_by_site[site_key].get(c[ck])
        
        # the answers
        answers = r.get('answers')
        if not question_only and answers:
            for a in answers:
                for k in a.keys():
                    if k.lower().endswith('userid'):
                        # use the same field name, minus the 'Id' on the end.
                        a[k[:-2]] = users_by_site[site_key].get(a[k])
                
                comments = a.get('comments')
                if not ignore_comments and comments:
                    for c in comments:
                        for ck in c.keys():
                            if ck.lower().endswith('userid'):
                                # use the same field name, minus the 'Id' on the end.
                                c[ck[:-2]] = users_by_site[site_key].get(c[ck])
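
Like retrieve_sites, this function mutates results in place: each field whose name ends in 'userid' (case-insensitively) gains a companion field, minus the 'Id' suffix, holding the matching User object. A sketch with hypothetical field names and IDs:

# 'ownerUserId' is a made-up field following the *UserId naming convention
results = [ { 'siteKey': 'superuser',
              'ownerUserId': 42,
              'question': { 'ownerUserId': 42 } } ]
retrieve_users(results)
owner = results[0]['question']['ownerUser']  # a User object, or None if no match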