Exemplo n.º 1
0
def update_dge_ga_visit_stats(period_name,
                              period_complete_day,
                              data,
                              print_progress=False):
    '''
    Store one DgeGaVisit row per entry of ``data`` for the given period.

    Each entry of ``data`` is unpacked as ``(key, key_value, sessions)``.
    The row is saved with ``period_name`` as ``year_month`` and
    ``period_complete_day`` as ``end_day``, and the session is committed
    after every single row.

    When ``print_progress`` is true, a GaProgressBar sized to ``len(data)``
    is advanced once per entry.
    '''
    print("Updating dge_ga_visits...")
    total = len(data)
    bar = GaProgressBar(total) if print_progress else None
    for position, (key, key_value, sessions) in enumerate(data, 1):
        if bar is not None:
            bar.update(position)
        record = DgeGaVisit(year_month=period_name,
                            end_day=period_complete_day,
                            sessions=sessions,
                            key=key,
                            key_value=key_value)
        model.Session.add(record)
        model.Session.commit()
    print("... Updated dge_ga_visits")
Exemplo n.º 2
0
def post_update_url_stats(print_progress=False):
    """ Make sure every distinct url in ga_url has an 'All' record.

        Rows whose url has no 'All' record yet are read from ga_url,
        their pageviews and visits are summed per url, and one new
        'All' GA_Url row is added for each such url.  Everything is
        committed once, after the last row has been added.

        After running this then every URL should have an All
        record regardless of whether the URL has an entry for
        the month being currently processed.
    """
    log.debug('Post-processing "All" records...')
    query = """select url, pageviews::int, visits::int
               from ga_url
               where url not in (select url from ga_url where period_name ='All')"""
    rows = model.Session.connection().execute(query)

    # Per-url running totals; result columns are (url, pageviews, visits).
    views = {}
    visits = {}
    for row in rows:
        url = row[0]
        views[url] = views.get(url, 0) + row[1]
        visits[url] = visits.get(url, 0) + row[2]

    total = len(views)
    bar = GaProgressBar(total) if print_progress else None
    identifier = Identifier()
    for position, url in enumerate(views, 1):
        if bar is not None:
            bar.update(position)

        package, publisher = identifier.get_package_and_publisher(url)

        model.Session.add(GA_Url(id=make_uuid(),
                                 period_name="All",
                                 period_complete_day=0,
                                 url=url,
                                 pageviews=views[url],
                                 visits=visits[url],
                                 department_id=publisher,
                                 package_id=package))
    model.Session.commit()
    log.debug('..done')
def update_csc_ga_session_stats(period_name, period_complete_day, data,
                     print_progress=False):
    '''
    Store one CscGaSession row per element of ``data`` for the given period.

    Every element of ``data`` is saved as the ``sessions`` value of a new
    row, with ``period_name`` as ``year_month`` and ``period_complete_day``
    as ``end_day``.  The session is committed after every single row.

    When ``print_progress`` is true, a GaProgressBar sized to ``len(data)``
    is advanced once per element.
    '''
    print("Updating csc_ga_sessions...")
    total = len(data)
    bar = GaProgressBar(total) if print_progress else None
    for position, sessions in enumerate(data, 1):
        if bar is not None:
            bar.update(position)
        record = CscGaSession(year_month=period_name,
                              end_day=period_complete_day,
                              sessions=sessions)
        model.Session.add(record)
        model.Session.commit()
    print("... Updated csc_ga_sessions")
Exemplo n.º 4
0
def update_url_stats(period_name, period_complete_day, url_data,
                     print_progress=False):
    '''
    Given a list of urls and number of hits for each during a given period,
    stores them in GA_Url under the period and recalculates the totals for
    the 'All' period.

    ``url_data`` is iterated as ``(url, views, visits)`` tuples.  For each
    tuple:

    * if a GA_Url row already exists for ``period_name`` and the url, the
      views/visits are added to it and a missing package_id/department_id
      is filled in; otherwise a new row is inserted;
    * if the url maps to a package, the 'All' row for the url is set to the
      sum over every non-'All' row of that url.  An existing 'All' row is
      updated in place, so a url that appears several times in ``url_data``
      (or in several calls) does not accumulate duplicate 'All' rows.

    The session is committed after each per-period write and again after
    each 'All' write.  When ``print_progress`` is true a GaProgressBar
    sized to ``len(url_data)`` is advanced once per tuple.
    '''
    progress_total = len(url_data)
    progress_count = 0
    if print_progress:
        progress_bar = GaProgressBar(progress_total)
    urls_in_ga_url_this_period = set(
        result[0] for result in model.Session.query(GA_Url.url)
                                     .filter(GA_Url.period_name==period_name)
                                     .all())
    identifier = Identifier()
    for url, views, visits in url_data:
        progress_count += 1
        if print_progress:
            progress_bar.update(progress_count)

        package, publisher = identifier.get_package_and_publisher(url)

        if url in urls_in_ga_url_this_period:
            item = model.Session.query(GA_Url).\
                filter(GA_Url.period_name==period_name).\
                filter(GA_Url.url==url).first()
            item.pageviews = int(item.pageviews or 0) + int(views or 0)
            item.visits = int(item.visits or 0) + int(visits or 0)
            if not item.package_id:
                item.package_id = package
            if not item.department_id:
                item.department_id = publisher
            model.Session.add(item)
        else:
            values = {'id': make_uuid(),
                      'period_name': period_name,
                      'period_complete_day': period_complete_day,
                      'url': url,
                      'pageviews': views,
                      'visits': visits,
                      'department_id': publisher,
                      'package_id': package
                      }
            model.Session.add(GA_Url(**values))
            # Remember the url so a repeat within url_data updates this row.
            urls_in_ga_url_this_period.add(url)
        model.Session.commit()

        if package:
            # Totals over every real period for this url (committed above,
            # so the row just written is included).
            counts = \
                model.Session.query(func.sum(cast(GA_Url.pageviews,
                                                  types.INTEGER)),
                                    func.sum(cast(GA_Url.visits,
                                                  types.INTEGER))
                                    ) \
                     .filter(GA_Url.period_name!='All') \
                     .filter(GA_Url.url==url) \
                     .all()
            total_pageviews, total_visits = counts[0]

            # Reuse the existing 'All' row when there is one; inserting a
            # fresh row each time would leave duplicates behind.
            all_item = model.Session.query(GA_Url).\
                filter(GA_Url.period_name=='All').\
                filter(GA_Url.url==url).first()
            if all_item is None:
                values = {'id': make_uuid(),
                          'period_name': 'All',
                          'period_complete_day': 0,
                          'url': url,
                          'pageviews': total_pageviews,
                          'visits': total_visits,
                          'department_id': publisher,
                          'package_id': package
                          }
                model.Session.add(GA_Url(**values))
            else:
                all_item.pageviews = total_pageviews
                all_item.visits = total_visits
                if not all_item.package_id:
                    all_item.package_id = package
                if not all_item.department_id:
                    all_item.department_id = publisher
                model.Session.add(all_item)
            model.Session.commit()
def update_csc_ga_dataset_stats(period_name, period_complete_day, url_data, print_progress=False):
    '''
    Given a list of urls and number of hits for each during a given period,
    stores them in CscGaDataset under the period.

    ``url_data`` is iterated as ``(url, views)`` tuples.  Rows are keyed by
    dataset name: the url is resolved with ``model.Package.get(url)`` and,
    when no package is found (possibly a purged dataset), with
    ``_get_previous_csc_ga_dataset_stats(url)``.  If a row for the resolved
    name already exists in this period its pageviews are increased,
    otherwise a new row is inserted.  Urls that cannot be resolved to a
    name are skipped.

    The session is committed once per tuple.  When ``print_progress`` is
    true a GaProgressBar sized to ``len(url_data)`` is advanced once per
    tuple.
    '''
    print("Updating csc_ga_dataset...")
    progress_total = len(url_data)
    progress_count = 0
    if print_progress:
        progress_bar = GaProgressBar(progress_total)
    # Dataset names that already have a row for this period.
    names_in_csc_ga_dataset_this_period = set(
        result[0] for result in model.Session.query(CscGaDataset.dataset_name)
                                     .filter(CscGaDataset.year_month==period_name)
                                     .all())
    # url -> dataset name it was stored under during this call, so a repeated
    # url finds its row even when the stored name differs from the url.
    stored_name_by_url = {}
    # url -> dataset name recovered from earlier periods (may be None); avoids
    # repeating the lookup for urls that never resolve.
    previous_name_by_url = {}
    for url, views in url_data:
        progress_count += 1
        if print_progress:
            progress_bar.update(progress_count)

        dataset_name = stored_name_by_url.get(url, url)
        if dataset_name not in names_in_csc_ga_dataset_this_period:
            print(url)
            dataset = model.Package.get(url)
            dataset_name = dataset.name if dataset else None
            # Only if dataset not found, possible purged dataset, check previous stats
            if dataset_name is None:
                if url not in previous_name_by_url:
                    # get persisted data from other periods
                    previous_name, _previous_org_id = \
                        _get_previous_csc_ga_dataset_stats(url)
                    previous_name_by_url[url] = previous_name
                dataset_name = previous_name_by_url[url]

            if not dataset_name:
                # Nothing to store for this url; keep the per-item commit.
                model.Session.commit()
                continue

        if dataset_name in names_in_csc_ga_dataset_this_period:
            item = model.Session.query(CscGaDataset).\
                filter(CscGaDataset.year_month==period_name).\
                filter(CscGaDataset.dataset_name==dataset_name).first()
            item.pageviews = int(item.pageviews or 0) + int(views or 0)
            model.Session.add(item)
        else:
            values = {
                    'year_month': period_name,
                    'end_day': period_complete_day,
                    'pageviews': views,
                    'dataset_name': dataset_name
                    }
            model.Session.add(CscGaDataset(**values))
            names_in_csc_ga_dataset_this_period.add(dataset_name)
        stored_name_by_url[url] = dataset_name
        model.Session.commit()
    print("...Updated csc_ga_dataset")
Exemplo n.º 6
0
def update_url_stats(period_name,
                     period_complete_day,
                     url_data,
                     print_progress=False):
    '''
    Given a list of urls and number of hits for each during a given period,
    stores them in GA_Url under the period and recalculates the totals for
    the 'All' period.

    ``url_data`` is iterated as ``(url, views, visits)`` tuples.  For each
    tuple:

    * if a GA_Url row already exists for ``period_name`` and the url, the
      views/visits are added to it and a missing package_id/department_id
      is filled in; otherwise a new row is inserted;
    * if the url maps to a package, the 'All' row for the url is set to the
      sum over every non-'All' row of that url.  An existing 'All' row is
      updated in place, so a url that appears several times in ``url_data``
      (or in several calls) does not accumulate duplicate 'All' rows.

    The session is committed after each per-period write and again after
    each 'All' write.  When ``print_progress`` is true a GaProgressBar
    sized to ``len(url_data)`` is advanced once per tuple.
    '''
    progress_total = len(url_data)
    progress_count = 0
    if print_progress:
        progress_bar = GaProgressBar(progress_total)
    urls_in_ga_url_this_period = set(
        result[0] for result in model.Session.query(GA_Url.url).filter(
            GA_Url.period_name == period_name).all())
    identifier = Identifier()
    for url, views, visits in url_data:
        progress_count += 1
        if print_progress:
            progress_bar.update(progress_count)

        package, publisher = identifier.get_package_and_publisher(url)

        if url in urls_in_ga_url_this_period:
            item = model.Session.query(GA_Url).\
                filter(GA_Url.period_name==period_name).\
                filter(GA_Url.url==url).first()
            item.pageviews = int(item.pageviews or 0) + int(views or 0)
            item.visits = int(item.visits or 0) + int(visits or 0)
            if not item.package_id:
                item.package_id = package
            if not item.department_id:
                item.department_id = publisher
            model.Session.add(item)
        else:
            values = {
                'id': make_uuid(),
                'period_name': period_name,
                'period_complete_day': period_complete_day,
                'url': url,
                'pageviews': views,
                'visits': visits,
                'department_id': publisher,
                'package_id': package
            }
            model.Session.add(GA_Url(**values))
            # Remember the url so a repeat within url_data updates this row.
            urls_in_ga_url_this_period.add(url)
        model.Session.commit()

        if package:
            # Totals over every real period for this url (committed above,
            # so the row just written is included).
            counts = \
                model.Session.query(func.sum(cast(GA_Url.pageviews,
                                                  types.INTEGER)),
                                    func.sum(cast(GA_Url.visits,
                                                  types.INTEGER))
                                    ) \
                     .filter(GA_Url.period_name!='All') \
                     .filter(GA_Url.url==url) \
                     .all()
            total_pageviews, total_visits = counts[0]

            # Reuse the existing 'All' row when there is one; inserting a
            # fresh row each time would leave duplicates behind.
            all_item = model.Session.query(GA_Url).\
                filter(GA_Url.period_name=='All').\
                filter(GA_Url.url==url).first()
            if all_item is None:
                values = {
                    'id': make_uuid(),
                    'period_name': 'All',
                    'period_complete_day': 0,
                    'url': url,
                    'pageviews': total_pageviews,
                    'visits': total_visits,
                    'department_id': publisher,
                    'package_id': package
                }
                model.Session.add(GA_Url(**values))
            else:
                all_item.pageviews = total_pageviews
                all_item.visits = total_visits
                if not all_item.package_id:
                    all_item.package_id = package
                if not all_item.department_id:
                    all_item.department_id = publisher
                model.Session.add(all_item)
            model.Session.commit()
Exemplo n.º 7
0
def update_dge_ga_package_stats(period_name,
                                period_complete_day,
                                url_data,
                                print_progress=False):
    '''
    Given a list of urls and number of hits for each during a given period,
    stores them in DgeGaPackage under the period.

    ``url_data`` is iterated as ``(url, views)`` tuples.  If a DgeGaPackage
    row already exists for ``period_name`` and the url, its pageviews are
    increased by ``views``.  Otherwise a new row is inserted with the
    package name, organization id and publisher id returned by
    ``Identifier.get_package_information(url)``; when that yields no
    package name (possibly a purged dataset) the values come from
    ``_get_previous_dge_ga_package_stats(url)`` instead, and a still-missing
    name is stored as an empty string.

    The session is committed once per tuple.  When ``print_progress`` is
    true a GaProgressBar sized to ``len(url_data)`` is advanced once per
    tuple.
    '''
    print("Updating dge_ga_package...")
    progress_total = len(url_data)
    progress_count = 0
    if print_progress:
        progress_bar = GaProgressBar(progress_total)
    urls_in_dge_ga_package_this_period = set(
        result[0] for result in model.Session.query(DgeGaPackage.url).filter(
            DgeGaPackage.year_month == period_name).all())

    identifier = Identifier()
    for url, views in url_data:
        progress_count += 1
        if print_progress:
            progress_bar.update(progress_count)

        if url in urls_in_dge_ga_package_this_period:
            item = model.Session.query(DgeGaPackage).\
                filter(DgeGaPackage.year_month==period_name).\
                filter(DgeGaPackage.url==url).first()
            item.pageviews = int(item.pageviews or 0) + int(views or 0)
            model.Session.add(item)
        else:
            pack_name, org_id, pub_id = identifier.get_package_information(url)

            # Only if package not found, possible purged dataset, check
            # previous stats.  No per-url cache is needed: the url is added
            # to the period set below, so this branch runs at most once per
            # url within a call.
            if pack_name is None:
                pack_name, org_id, pub_id = _get_previous_dge_ga_package_stats(
                    url)

            if pack_name is None:
                pack_name = u''
            values = {
                'year_month': period_name,
                'end_day': period_complete_day,
                'url': url,
                'pageviews': views,
                'package_name': pack_name,
                'organization_id': org_id,
                'publisher_id': pub_id
            }
            model.Session.add(DgeGaPackage(**values))
            urls_in_dge_ga_package_this_period.add(url)
        model.Session.commit()
    print("...Updated dge_ga_package")