Example No. 1
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()


    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print("Doing dry-run upload to Elastic search.  Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            )
        }

    tally = {}
    seen = {}

    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    # The third argument (0) requests unbuffered writes so the resume log survives a crash.
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:
        for batch_id in range(1, batch_count+1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("  ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
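
The helper load_batch_for() called above is not part of this listing; only its signature is visible. A minimal sketch of what such a helper could look like, assuming the batch files are named '<domain>-<NNNN>.data' in the transform02 directory with one JSON pageview per line, and that the Keen collection is called 'pageviews' (all assumptions, not the project's actual code):

import json

def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    """Hypothetical sketch of the batch uploader called by main() above.

    The file layout, the 'pageviews' collection name, and the tally key are
    assumptions; the real helper is not shown in this example.
    """
    batch_path = '{}/{}-{:04d}.data'.format(
        utils.get_dir_for('transform02'), domain, batch_id)
    with open(batch_path, 'r') as batch_file:
        pageviews = [json.loads(line) for line in batch_file]

    if dry_run:
        # Dry run: index each document into the local Elasticsearch instance.
        for pageview in pageviews:
            es_client.index(index=script_settings.ES_INDEX,
                            doc_type='pageview', body=pageview)
    else:
        # Real run: send the whole batch to the matching Keen project.
        keen_client.add_events({'pageviews': pageviews})

    tally[domain] = tally.get(domain, 0) + len(pageviews)

In a dry run this would fill the throwaway Elasticsearch index that main() deletes up front; otherwise it would push the whole batch to the Keen project matching the domain.
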
Example No. 2
def main():
    """This script extracts pageview data from the OSF Piwik db and outputs the results
    to a dumpfile.
    """

    try:
        mysql_db = MySQLdb.connect(host=settings.PIWIK_DB_HOST,
                                   port=settings.PIWIK_DB_PORT,
                                   user=settings.PIWIK_DB_USER,
                                   passwd=settings.PIWIK_DB_PASSWORD,
                                   db=settings.PIWIK_DB_NAME)
    except MySQLdb.Error as err:
        print "MySQL Error [%d]: %s" % (err.args[0], err.args[1])
        raise err

    my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor)
    my_cursor.execute("SET NAMES 'utf8'")
    my_cursor.execute(
        "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO,"
        "NO_ZERO_DATE,NO_ZERO_IN_DATE';")

    history_file = utils.get_history_for('extract', 'w')
    output_file = open(
        utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'w')

    count = 0
    last_count = 0

    history_file.write(settings.RUN_HEADER + '{}\n'.format(uuid.uuid4()))
    history_file.write('Beginning extraction at: {}Z\n'.format(
        datetime.utcnow()))
    visit_cursor = get_visits(mysql_db)
    visit = visit_cursor.fetchone()
    while visit is not None:
        visit['tz_offset'] = calculate_tz_offset(
            str(visit['visitor_localtime']),
            str(visit['first_contact_server_time']))

        action_cursor = get_actions_for_visit(mysql_db, visit['idvisit'])
        action = action_cursor.fetchone()

        action_count = 0
        while action is not None:
            action_count += 1
            referrer_url = None
            if action_count == 1 and visit['referer_type'] in (2, 3):
                referrer_url = visit['referer_url']
            elif action['previous_url']:
                url_scheme = ('' if action['previous_url_prefix'] is None
                              else URL_PREFIX[action['previous_url_prefix']])
                referrer_url = url_scheme + action['previous_url']

            # Piwik stores site searches oddly; rebuild the search URL from the title.
            if action['page_title_type'] and action['page_title_type'] == 8:
                action['page_url'] = 'staging.osf.io/search/?q=' + action['page_title']
                action['page_url_prefix'] = 2
                action['page_title'] = 'OSF | Search'

            pageview = {
                'visit': {
                    'id': visit['idvisit'],
                    'visitor_id': b2a_hex(visit['idvisitor']),
                    'visitor_returning': visit['visitor_returning'],
                    'ip_addr': None if visit['location_ip'] == NULL_IP
                               else inet_ntoa(visit['location_ip']),
                    'user_id': visit['user_id'],
                    'tz_offset': visit['tz_offset'],
                    'ua': {
                        'os': visit['config_os'],
                        'os_version': None,
                        'browser': {
                            'version': visit['config_browser_version'],
                            'name': visit['config_browser_name'],
                            'cookies': visit['config_cookie'],
                            'locale': visit['location_browser_lang'],
                        },
                        'screen': visit['config_resolution'],
                        'device': None,
                    },
                },
                'action': {
                    'id': action['visit_action_id'],
                    'parent_node_id': action['parent_node_id'],
                    'node_id': action['node_id'],
                    'node_tags': action['node_tags'],
                    'page': {
                        'url': action['page_url'],
                        'url_prefix': None if action['page_url_prefix'] is None
                                      else URL_PREFIX[action['page_url_prefix']],
                        'url_id': action['page_url_id'],
                        'url_type': action['page_url_type'],
                        'title': action['page_title'],
                        'title_id': action['page_title_id'],
                        'title_type': action['page_title_type'],
                        'is_search': action['page_title_type'] == 8,
                    },
                    'referrer': referrer_url,
                    'timestamp': str(action['server_time']),
                },
            }

            output_file.write(json.dumps(pageview) + '\n')
            history_file.write('\tLast action written timestamp: ' +
                               str(action['server_time']) + '\n')
            count += 1
            action = action_cursor.fetchone()

        visit = visit_cursor.fetchone()

    history_file.write('Finished extraction at: {}Z\n'.format(
        datetime.utcnow()))
    history_file.write('Final count was: {}\n'.format(count))
    print("Final count is: {}".format(count))
    history_file.close()
    output_file.close()

    mysql_db.close()
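
The constants URL_PREFIX and NULL_IP used above are defined elsewhere in the module. A plausible definition, assuming the usual Piwik encoding of URL prefixes and a packed all-zero address for a missing visitor IP, might be:

from socket import inet_aton

# Piwik stores the scheme/'www.' prefix of a URL as a small integer; this
# mapping follows the common Piwik convention and is an assumption here.
URL_PREFIX = {
    0: 'http://',
    1: 'http://www.',
    2: 'https://',
    3: 'https://www.',
}

# Piwik stores visitor IPs as packed binary; an all-zero address is treated
# as "no IP recorded" (again, an assumed definition).
NULL_IP = inet_aton('0.0.0.0')
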
Example No. 3
def main(force=False):

    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()


    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning transform at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)  # seek to just before the end (whence=2) to read the last line
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass


    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # Only pageviews logged after the node's most recent made-public date
                # are copied to the public collection.
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo' ):
                        del public_pageview[private_property]

                    for addon in public_pageview['keen']['addons']:
                        if addon['name'] in ('keen:ip_to_geo', 'keen:ua_parser'):
                            public_pageview['keen']['addons'].remove(addon)

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
        
        if linenum % settings.BATCH_SIZE != 0:
            batchnum += 1
            write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
            write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
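
The write_batch() helper is also not shown. Judging from its call sites it must flush the accumulated pageviews for one domain to a numbered data file (matching the otherwise unused public_template/private_template patterns above) and, since main() never resets public_pageviews or private_pageviews, presumably empty the list in place as well. A sketch under those assumptions:

import json

def write_batch(batchnum, run_id, domain, pageviews, transform_dir):
    """Hypothetical sketch of the batch writer called by main() above.

    Writes one JSON pageview per line to '<domain>-<NNNN>.data' and clears
    the caller's list in place; the real helper is not shown here, and
    run_id is assumed to be used only for bookkeeping.
    """
    batch_path = '{}/{}-{:04d}.data'.format(transform_dir, domain, batchnum)
    with open(batch_path, 'w') as batch_file:
        for pageview in pageviews:
            batch_file.write(json.dumps(pageview) + '\n')
    del pageviews[:]  # empty the list without rebinding, so the caller keeps its reference
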
Example No. 4
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your second-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER +
                       '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print("Doing dry-run upload to Elasticsearch. Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            ),
        }

    tally = {}
    seen = {}

    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:
        for batch_id in range(1, batch_count + 1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("  ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(
                    domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client,
                               keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
Example No. 5
def main(force=False):

    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning transform at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(
                            linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # Only pageviews logged after the node's most recent made-public date
                # are copied to the public collection.
                if (made_public_date is not None
                        and made_public_date < pageview['keen']['timestamp']):
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    for addon in public_pageview['keen']['addons']:
                        if addon['name'] in ('keen:ip_to_geo',
                                             'keen:ua_parser'):
                            public_pageview['keen']['addons'].remove(addon)

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public',
                                public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private',
                                private_pageviews, transform_dir)

        if linenum % settings.BATCH_SIZE != 0:
            batchnum += 1
            write_batch(batchnum, complaints_run_id, 'public',
                        public_pageviews, transform_dir)
            write_batch(batchnum, complaints_run_id, 'private',
                        private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
Example No. 6
def main():
    """This script extracts pageview data from the OSF Piwik db and outputs the results
    to a dumpfile.
    """

    try:
        mysql_db = MySQLdb.connect(
            host=settings.PIWIK_DB_HOST,
            port=settings.PIWIK_DB_PORT,
            user=settings.PIWIK_DB_USER,
            passwd=settings.PIWIK_DB_PASSWORD,
            db=settings.PIWIK_DB_NAME,
        )
    except MySQLdb.Error as err:
        print "MySQL Error [%d]: %s" % (err.args[0], err.args[1])
        raise err

    my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor)
    my_cursor.execute("SET NAMES 'utf8'")
    my_cursor.execute(
        "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO,"
        "NO_ZERO_DATE,NO_ZERO_IN_DATE';"
    )

    history_file = utils.get_history_for("extract", "w")
    output_file = open(utils.get_dir_for("extract") + "/" + settings.EXTRACT_FILE, "w")

    count = 0
    last_count = 0

    history_file.write(settings.RUN_HEADER + "{}\n".format(uuid.uuid4()))
    history_file.write("Beginning extraction at: {}Z\n".format(datetime.utcnow()))
    visit_cursor = get_visits(mysql_db)
    visit = visit_cursor.fetchone()
    while visit is not None:
        visit["tz_offset"] = calculate_tz_offset(
            str(visit["visitor_localtime"]), str(visit["first_contact_server_time"])
        )

        action_cursor = get_actions_for_visit(mysql_db, visit["idvisit"])
        action = action_cursor.fetchone()

        action_count = 0
        while action is not None:
            action_count += 1
            referrer_url = None
            if action_count == 1 and visit["referer_type"] in (2, 3):
                referrer_url = visit["referer_url"]
            elif action["previous_url"]:
                url_scheme = "" if action["previous_url_prefix"] is None else URL_PREFIX[action["previous_url_prefix"]]
                referrer_url = url_scheme + action["previous_url"]

            # Piwik stores site searches oddly; rebuild the search URL from the title.
            if action["page_title_type"] and action["page_title_type"] == 8:
                action["page_url"] = "staging.osf.io/search/?q=" + action["page_title"]
                action["page_url_prefix"] = 2
                action["page_title"] = "OSF | Search"

            pageview = {
                "visit": {
                    "id": visit["idvisit"],
                    "visitor_id": b2a_hex(visit["idvisitor"]),
                    "visitor_returning": visit["visitor_returning"],
                    "ip_addr": None if visit["location_ip"] == NULL_IP else inet_ntoa(visit["location_ip"]),
                    "user_id": visit["user_id"],
                    "tz_offset": visit["tz_offset"],
                    "ua": {
                        "os": visit["config_os"],
                        "os_version": None,
                        "browser": {
                            "version": visit["config_browser_version"],
                            "name": visit["config_browser_name"],
                            "cookies": visit["config_cookie"],
                            "locale": visit["location_browser_lang"],
                        },
                        "screen": visit["config_resolution"],
                        "device": None,
                    },
                },
                "action": {
                    "id": action["visit_action_id"],
                    "parent_node_id": action["parent_node_id"],
                    "node_id": action["node_id"],
                    "node_tags": action["node_tags"],
                    "page": {
                        "url": action["page_url"],
                        "url_prefix": None
                        if action["page_url_prefix"] is None
                        else URL_PREFIX[action["page_url_prefix"]],
                        "url_id": action["page_url_id"],
                        "url_type": action["page_url_type"],
                        "title": action["page_title"],
                        "title_id": action["page_title_id"],
                        "title_type": action["page_title_type"],
                        "is_search": True if action["page_title_type"] and action["page_title_type"] == 8 else False,
                    },
                    "referrer": referrer_url,
                    "timestamp": str(action["server_time"]),
                },
            }

            output_file.write(json.dumps(pageview) + "\n")
            history_file.write("\tLast action written timestamp: " + str(action["server_time"]) + "\n")
            count += 1
            action = action_cursor.fetchone()

        visit = visit_cursor.fetchone()

    history_file.write("Finished extraction at: {}Z\n".format(datetime.utcnow()))
    history_file.write("Final count was: {}\n".format(count))
    print ("Final count is: {}".format(count))
    history_file.close()
    output_file.close()

    mysql_db.close()