def verify_files(domain, batch_count, run_id, complaints_file):
    complaints = 0
    work_dir = utils.get_dir_for('transform02')
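    # transform02 writes its batch files as '<domain>-NNNN.data' (see the public/private filename templates in the transform02 example later on)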
    files = glob.glob(work_dir + '/' + domain + '-*.data')
    if batch_count > len(files):
        complaints += 1
        complaints_file.write('Too few {} files found! got {}, expected {}\n'.format(
            domain, len(files), batch_count,
        ))
    elif batch_count < len(files):
        complaints += 1
        complaints_file.write('Too many {} files found! got {}, expected {}\n'.format(
            domain, len(files), batch_count,
        ))

    lastfile_re = domain + r'-\d*' + str(batch_count) + r'\.data'
    for filename in files:
        data_file = open(filename, 'r')
        file_run_id = data_file.readline().replace(settings.RUN_HEADER, '').rstrip()
        if file_run_id != run_id:
            complaints += 1
            complaints_file.write('Invalid Run ID for {}! got {}, expected {}\n'.format(
                filename, file_run_id, run_id,
            ))
            break

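        # each data file holds two lines: a run-id header, then the batch's events as one JSON array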
        events = json.loads(data_file.readline())
        if len(events) != settings.BATCH_SIZE and not re.search(lastfile_re, filename):
            complaints += 1
            complaints_file.write('Not enough events for {}! got {}, expected {}\n'.format(
                filename, len(events), settings.BATCH_SIZE,
            ))

        if domain == 'public':
            eventnum = 0
            for event in events:
                eventnum += 1
                # events are plain dicts from json.loads, so check keys rather than attributes
                if 'tech' in event:
                    complaints += 1
                    complaints_file.write(
                        'Event {} in {} has private data! "tech" shouldn\'t be included\n'.format(
                            eventnum, filename,
                        )
                    )
                if 'user' in event:
                    complaints += 1
                    complaints_file.write(
                        'Event {} in {} has private data! "user" shouldn\'t be included\n'.format(
                            eventnum, filename,
                        )
                    )

    return complaints
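
A minimal driver sketch (not part of the original example): it assumes verify_files is called once per domain, that the batch count and complaints file come from the same utils helpers used by the other scripts here, and that the caller supplies the run id.

def verify_all(run_id):
    batch_count = utils.get_batch_count()
    complaints_file = utils.get_complaints_for('transform02', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))
    total = 0
    for domain in ('private', 'public'):
        total += verify_files(domain, batch_count, run_id, complaints_file)
    complaints_file.close()
    return total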
Example #2
def main():

    input_filename = '/'.join([
        utils.get_dir_for('extract'),
        settings.EXTRACT_FILE,
    ])
    input_file = open(input_filename, 'r')

    run_id = utils.get_history_run_id_for('extract')
    complaints_file = utils.get_complaints_for('extract', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    for pageview_json in input_file.readlines():
        linenum += 1
        pageview = json.loads(pageview_json)

        visit = pageview['visit']
        action = pageview['action']

        # are all the IP addresses scrubbed? (piwik drops the last 16 bits, so they should end in '0.0')
        if not re.search(r'0\.0$', visit['ip_addr']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: unscrubbed ip address! ({})\n'.format(
                    linenum, action['id'], visit['ip_addr']))

        if not action['page']['url']:
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url is missing!\n'.format(
                    linenum, action['id']))
        elif re.match(r'https?://', action['page']['url']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url includes domain! ({})\n'.format(
                    linenum, action['id'],
                    action['page']['url'].encode('utf-8')))

    if complaints > 0:
        print("You've got {} problems, but a ready-to-go migration ain't one!".
              format(complaints))
    else:
        print("Looks good.  How'd you manage that?")
Example #3
def main():

    input_filename = '/'.join([utils.get_dir_for('transform01'), settings.TRANSFORM01_FILE,])
    input_file = open(input_filename, 'r')

    run_id = utils.get_history_run_id_for('transform01')
    complaints_file = utils.get_complaints_for('transform01', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    for pageview_json in input_file.readlines():
        linenum += 1
        if not linenum % 100:
            print('Validating line {}'.format(linenum))

        pageview = json.loads(pageview_json)

        if pageview['page']['url'] is None:
            complaints += 1
            complaints_file.write('Line {}: empty url!\n'.format(linenum))

        # if pageview['page']['title'] is None:
        #     complaints += 1
        #     complaints_file.write('Line {}: empty page title!\n'.format(linenum))

        if pageview['time']['utc'] is None:
            complaints += 1
            complaints_file.write('Line {}: missing timestamp!\n'.format(linenum))

        if pageview['tech']['ip'] is not None:
            if pageview['anon']['continent'] is None or pageview['anon']['country'] is None:
                complaints += 1
                complaints_file.write(
                    'Line {}: Have IP addr ({}), but missing continent and/or country: ({} / {})\n'.format(
                        linenum, pageview['tech']['ip'], pageview['anon']['continent'] or 'None',
                        pageview['anon']['country'] or 'None'
                    )
                )

    if complaints > 0:
        print("I got {} reasons to be mad at you.  ".format(complaints))
    else:
        print("You've done your homework, have a cookie!");
Example #4
File: load.py  Project: brianjgeiger/osf.io
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id
    )
    data_file = open(data_dir + '/' + batch_filename, 'r')
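    # first line of the data file is the run-id header; the second is the JSON list of events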
    run_id = data_file.readline().rstrip()
    events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]

        stats = es_bulk(
            client=es_client, stats_only=True, actions=actions,
        )
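        # keep the bulk-index stats per '<domain>-<batch id>' so the caller can write them to its history file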
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})
Example #5
def main():

    run_id = utils.get_history_run_id_for('transform01')
    complaints_file = utils.get_complaints_for('transform01', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
        for i, pageview_json in enumerate(input_file):
            linenum = i + 1
            if not linenum % 100:
                print('Validating line {}'.format(linenum))

            pageview = json.loads(pageview_json)

            if pageview['page']['url'] is None:
                complaints += 1
                complaints_file.write('Line {}: empty url!\n'.format(linenum))

            if pageview['time']['utc'] is None:
                complaints += 1
                complaints_file.write('Line {}: missing timestamp!\n'.format(linenum))

            if pageview['tech']['ip'] is not None:
                if pageview['anon']['continent'] is None or pageview['anon']['country'] is None:
                    complaints += 1
                    complaints_file.write(
                        'Line {}: Have IP addr ({}), but missing continent and/or country: ({} / {})\n'.format(
                            linenum, pageview['tech']['ip'], pageview['anon']['continent'] or 'None',
                            pageview['anon']['country'] or 'None'
                        )
                    )

    if complaints > 0:
        print("I got {} reasons to be mad at you.  ".format(complaints))
    else:
        print("You've done your homework, have a cookie!");
Example #6
File: load.py  Project: j-p-courneya/osf.io
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id)
    events = []
    with open(data_dir + '/' + batch_filename, 'r') as data_file:
        run_id = data_file.readline().rstrip()
        events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]

        stats = es_bulk(
            client=es_client,
            stats_only=True,
            actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})
Example #7
File: load.py  Project: atelic/osf.io
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()


    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print("Doing dry-run upload to Elastic search.  Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
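        # a dry run deletes and re-fills a local Elasticsearch index instead of sending anything to Keen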
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
            pass
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private':  KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            )
        }

    tally = {}
    seen = {}
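    # 'seen' is rebuilt from resume.log so a re-run skips any domain/batch that already uploaded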

    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:  # no resume log yet; start from the beginning
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:
        for batch_id in range(1, batch_count+1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("  ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
Example #8
def main():
    """This script extracts pageview data from the OSF Piwik db and outputs the results
    to a dumpfile.
    """

    try:
        mysql_db = MySQLdb.connect(host=settings.PIWIK_DB_HOST,
                                   port=settings.PIWIK_DB_PORT,
                                   user=settings.PIWIK_DB_USER,
                                   passwd=settings.PIWIK_DB_PASSWORD,
                                   db=settings.PIWIK_DB_NAME)
    except MySQLdb.Error as err:
        print "MySQL Error [%d]: %s" % (err.args[0], err.args[1])
        raise err

    my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor)
    my_cursor.execute("SET NAMES 'utf8'")
    my_cursor.execute(
        "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO,"
        "NO_ZERO_DATE,NO_ZERO_IN_DATE';")

    history_file = utils.get_history_for('extract', 'w')
    output_file = open(
        utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'w')

    count = 0
    last_count = 0

    history_file.write(settings.RUN_HEADER + '{}\n'.format(uuid.uuid4()))
    history_file.write('Beginning extraction at: {}Z\n'.format(
        datetime.utcnow()))
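    # stream visits one row at a time; a second cursor fetches each visit's actions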
    visit_cursor = get_visits(mysql_db)
    visit = visit_cursor.fetchone()
    while visit is not None:
        visit['tz_offset'] = calculate_tz_offset(
            str(visit['visitor_localtime']),
            str(visit['first_contact_server_time']))

        action_cursor = get_actions_for_visit(mysql_db, visit['idvisit'])
        action = action_cursor.fetchone()

        action_count = 0
        while action is not None:
            action_count += 1
            referrer_url = None
            if action_count == 1 and visit['referer_type'] in (2, 3):
                referrer_url = visit['referer_url']
            elif action['previous_url']:
                url_scheme = '' if action['previous_url_prefix'] is None else URL_PREFIX[action['previous_url_prefix']]
                referrer_url = url_scheme + action['previous_url']

            # piwik stores searches weird.
            if action['page_title_type'] and action['page_title_type'] == 8:
                action['page_url'] = 'staging.osf.io/search/?q=' + action['page_title']
                action['page_url_prefix'] = 2
                action['page_title'] = 'OSF | Search'

            pageview = {
                'visit': {
                    'id': visit['idvisit'],
                    'visitor_id': b2a_hex(visit['idvisitor']),
                    'visitor_returning': visit['visitor_returning'],
                    'ip_addr': None if visit['location_ip'] == NULL_IP else inet_ntoa(visit['location_ip']),
                    'user_id': visit['user_id'],
                    'tz_offset': visit['tz_offset'],
                    'ua': {
                        'os': visit['config_os'],
                        'os_version': None,
                        'browser': {
                            'version': visit['config_browser_version'],
                            'name': visit['config_browser_name'],
                            'cookies': visit['config_cookie'],
                            'locale': visit['location_browser_lang'],
                        },
                        'screen': visit['config_resolution'],
                        'device': None,
                    },
                },
                'action': {
                    'id': action['visit_action_id'],
                    'parent_node_id': action['parent_node_id'],
                    'node_id': action['node_id'],
                    'node_tags': action['node_tags'],
                    'page': {
                        'url': action['page_url'],
                        'url_prefix': None if action['page_url_prefix'] is None else URL_PREFIX[action['page_url_prefix']],
                        'url_id': action['page_url_id'],
                        'url_type': action['page_url_type'],
                        'title': action['page_title'],
                        'title_id': action['page_title_id'],
                        'title_type': action['page_title_type'],
                        'is_search': True if action['page_title_type'] and action['page_title_type'] == 8 else False,
                    },
                    'referrer': referrer_url,
                    'timestamp': str(action['server_time']),
                },
            }

            output_file.write(json.dumps(pageview) + '\n')
            history_file.write('\tLast action written timestamp: ' +
                               str(action['server_time']) + '\n')
            count += 1
            action = action_cursor.fetchone()

        visit = visit_cursor.fetchone()

    history_file.write('Finished extraction at: {}Z\n'.format(
        datetime.utcnow()))
    history_file.write('Final count was: {}\n'.format(count))
    print("Final count is: {}".format(count))
    history_file.close()
    output_file.close()

    mysql_db.close()
Example #9
def main(force=False):
    history_run_id = utils.get_history_run_id_for('extract')
    complaints_run_id = utils.get_complaints_run_id_for('extract')
    if history_run_id != complaints_run_id:
        print('You need to validate your exported data! Bailing...')
        sys.exit()

    extract_complaints = utils.get_complaints_for('extract', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print('You have unaddressed complaints!')
        if not force:
            print('  ...pass --force to ignore')
            sys.exit()
    extract_complaints.close()

    sqlite_db = sqlite3.connect(settings.SQLITE_PATH)
    sqlite_db.row_factory = sqlite3.Row
    sqlite_setup(sqlite_db)

    transform_dir = utils.get_dir_for('transform01')

    logger.info('Run ID: {}\n'.format(complaints_run_id))
    logger.info('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))
    tally = {'missing_user': 0, 'missing_node': 0}
    linenum = 0
    lastline = 0
    try:
        with open(utils.get_dir_for('transform01') + '/resume.log', 'r') as fp:
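            # read only the tail of the resume log; its last line records the last line number already transformed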
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    with open(utils.get_dir_for('transform01') + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(transform_dir + '/' + settings.TRANSFORM01_FILE, 'a') as output_file:
            with open(utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'r') as input_file:
                print('Lastline is: {}\n'.format(lastline))
                for i, pageview_json in enumerate(input_file):
                    linenum = i + 1
                    if linenum <= lastline:
                        if not linenum % 1000:
                            print('Skipping line {} of ***{}***'.format(linenum, lastline))
                        continue

                    if not linenum % 1000:
                        print('Transforming line {}'.format(linenum))

                    raw_pageview = json.loads(pageview_json)
                    visit = raw_pageview['visit']
                    action = raw_pageview['action']

                    # lookup location by ip address. piwik strips last 16 bits, so may not be completely
                    # accurate, but should be close enough.
                    ip_addr = visit['ip_addr']
                    location = get_location_for_ip_addr(ip_addr, sqlite_db)

                    # user has many visitor ids, visitor id has many session ids.
                    # in keen, visitor id will refresh 1/per year, session 1/per 30min.
                    visitor_id = get_or_create_visitor_id(visit['visitor_id'], sqlite_db)
                    session_id = get_or_create_session_id(visit['id'], sqlite_db)

                    user_id = visit['user_id']
                    user = get_or_create_user(user_id, sqlite_db)

                    node_id = action['node_id']
                    node = get_or_create_node(node_id, sqlite_db)

                    browser_version = [None, None]
                    if visit['ua']['browser']['version']:
                        browser_version = visit['ua']['browser']['version'].split('.')

                    os_version = [None, None]
                    if visit['ua']['os_version']:
                        os_version = visit['ua']['os_version'].split('.')
                        if len(os_version) == 1:
                            os_version.append(None)

                    os_family = parse_os_family(visit['ua']['os'])
                    if visit['ua']['os'] == 'WIN' and visit['ua']['os_version']:
                        os_family = os_family.replace('<Unknown Version>', visit['ua']['os_version'])

                    browser_info = {
                        'device': {
                            'family': visit['ua']['device'],
                        },
                        'os': {
                            'major': os_version[0],
                            'patch_minor': None,
                            'minor': os_version[1],
                            'family': os_family,
                            'patch': None,
                        },
                        'browser': {
                            'major': browser_version[0],
                            'minor': browser_version[1],
                            'family': parse_browser_family(visit['ua']['browser']['name']),
                            'patch': None,
                        },
                    }

                    browser_language = None  # avoid reusing a value from a previous pageview
                    browser_locale = visit['ua']['browser']['locale']
                    if browser_locale and '-' in browser_locale:
                        browser_locale = browser_locale.split('-')
                        browser_language = '-'.join([browser_locale[0], browser_locale[1].upper()])

                    node_tags = None if action['node_tags'] is None else [
                        tag for tag in action['node_tags'].split(',')
                    ]

                    # piwik stores resolution as 1900x600 mostly, but sometimes as a float?
                    # For the sake of my sanity and yours, let's ignore floats.
                    screen_resolution = (None, None)
                    if re.search('x', visit['ua']['screen']):
                        screen_resolution = visit['ua']['screen'].split('x')

                    # piwik fmt: '2016-05-11 20:30:00', keen fmt: '2016-06-30T17:12:50.070Z'
                    # piwik is always utc
                    utc_timestamp = datetime.strptime(action['timestamp'], '%Y-%m-%d %H:%M:%S')
                    utc_ts_formatted = utc_timestamp.isoformat() + '.000Z'  # naive, but correct

                    local_timedelta = timedelta(minutes=visit['tz_offset'])
                    local_timestamp = utc_timestamp + local_timedelta

                    pageview = {
                        'meta': {
                            'epoch': 0,  # migrated from piwik
                        },
                        'page': {
                            'title': action['page']['title'],
                            'url': (action['page']['url_prefix'] or '') + action['page']['url'] if action['page']['url'] is not None else None,
                            'info': {}  # (add-on)
                        },
                        'referrer': {
                            'url': action['referrer'] or None,
                            'info': {},  # (add-on)
                        },
                        'tech': {
                            'browser': {  # JS-side will be filled in by Keen.helpers.getBrowserProfile()
                                'cookies': True if visit['ua']['browser']['cookies'] else False,
                                'language': browser_language,
                                'screen': {
                                    'height': screen_resolution[1],
                                    'width': screen_resolution[0],
                                },
                            },
                            'ip': ip_addr,  # private
                            'ua': None,
                            'info': browser_info,
                        },
                        'time': {
                            'utc': timestamp_components(utc_timestamp),
                            'local': timestamp_components(local_timestamp),
                        },
                        'visitor': {
                            'id': visitor_id,
                            'session': session_id,
                            'returning': True if visit['visitor_returning'] else False,  # visit
                        },
                        'user': {
                            'id': user_id,
                            'entry_point': '' if user is None else user['entry_point'],  # empty string if no user
                            'locale': '' if user is None else user['locale'],  # empty string if no user
                            'timezone': '' if user is None else user['timezone'],  # empty string if no user
                            'institutions': None if user is None else user['institutions'],  # null if no user, else []
                        },
                        'node': {
                            'id': node_id,
                            'title': None if node is None else node['title'],
                            'type': None if node is None else node['category'],
                            'tags': node_tags,
                            'made_public_date': None if node is None else node['made_public_date'],
                        },
                        'geo': {},
                        'anon': {
                            'id': md5(session_id).hexdigest(),
                            'continent': None if location is None else location['continent'],
                            'country': None if location is None else location['country'],
                        },
                        'keen': {
                            'timestamp': utc_ts_formatted,
                            'addons': [
                                {
                                    'name': 'keen:referrer_parser',
                                    'input': {
                                        'referrer_url': 'referrer.url',
                                        'page_url': 'page.url'
                                    },
                                    'output': 'referrer.info'
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'page.url'
                                    },
                                    'output': 'page.info'
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'referrer.url'
                                    },
                                    'output': 'referrer.info'
                                },
                                {  # private
                                    'name': 'keen:ip_to_geo',
                                    'input': {
                                        'ip': 'tech.ip'
                                    },
                                    'output': 'geo',
                                }
                            ],
                        }
                    }

                    if node_id is None:
                        tally['missing_node'] += 1

                    if user_id is None:
                        tally['missing_user'] += 1

                    output_file.write(json.dumps(pageview) + '\n')
                    resume_file.write(str(linenum) + '\n')

    logger.info('Finished extraction at: {}Z\n'.format(datetime.utcnow()))
    logger.info('Final count was: {}\n'.format(linenum))
    logger.info('{} pageviews lacked a user id.\n'.format(tally['missing_user']))
    logger.info('{} pageviews lacked a node id.\n'.format(tally['missing_node']))
    sqlite_db.close()
Example #10
def main(force=False):

    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()


    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass


    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # only pageviews logged after the most recent make public date are copied to public
                # collection
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo' ):
                        del public_pageview[private_property]

                    # rebuild the addon list rather than removing while iterating, which would skip entries
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
        
        if linenum % settings.BATCH_SIZE != 0:
            batchnum += 1
            write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
            write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
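
write_batch itself does not appear in any of these examples. The following is only a sketch of what it would have to do, inferred from the file format that verify_files and load_batch_for read back (a settings.RUN_HEADER line followed by a single JSON array) and from the '<domain>-NNNN.data' filename templates above; it assumes the module-level json and settings imports used throughout these scripts, and clearing the list in place is an assumption.

def write_batch(batchnum, run_id, domain, pageviews, transform_dir):
    filename = '{}/{}-{:04d}.data'.format(transform_dir, domain, batchnum)
    with open(filename, 'w') as batch_file:
        batch_file.write(settings.RUN_HEADER + '{}\n'.format(run_id))
        batch_file.write(json.dumps(pageviews) + '\n')
    del pageviews[:]  # assumption: empty the caller's list so the next batch starts fresh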
Example #11
File: load.py  Project: j-p-courneya/osf.io
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print(
            "You need to validate your second-phase transformed data! Bailing..."
        )
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print(
            "You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER +
                       '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print(
            "Doing dry-run upload to Elasticsearch.  Pass --for-reals to upload to Keen"
        )
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
            pass
    else:
        keen_clients = {
            'public':
            KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private':
            KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            )
        }

    tally = {}
    seen = {}

    try:
        with open(utils.get_dir_for('load') + '/resume.log',
                  'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:  # no resume log yet; start from the beginning
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:
        for batch_id in range(1, batch_count + 1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("  ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(
                    domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client,
                               keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
Example #12
def main(force=False):

    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print(
            "You need to validate your first-phase transformed data! Bailing..."
        )
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning extraction at: {}Z\n'.format(
        datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a',
              0) as resume_file:  # Pass 0 for unbuffered writing
        with open(
                utils.get_dir_for('transform01') + '/' +
                settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(
                            linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # only pageviews logged after the most recent make public date are copied to public
                # collection
                if made_public_date is not None and made_public_date < pageview[
                        'keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    # rebuild the addon list rather than removing while iterating, which would skip entries
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public',
                                public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private',
                                private_pageviews, transform_dir)

        if linenum % settings.BATCH_SIZE != 0:
            batchnum += 1
            write_batch(batchnum, complaints_run_id, 'public',
                        public_pageviews, transform_dir)
            write_batch(batchnum, complaints_run_id, 'private',
                        private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
Example #13
def main():
    """This script extracts pageview data from the OSF Piwik db and outputs the results
    to a dumpfile.
    """

    try:
        mysql_db = MySQLdb.connect(
            host=settings.PIWIK_DB_HOST,
            port=settings.PIWIK_DB_PORT,
            user=settings.PIWIK_DB_USER,
            passwd=settings.PIWIK_DB_PASSWORD,
            db=settings.PIWIK_DB_NAME,
        )
    except MySQLdb.Error as err:
        print "MySQL Error [%d]: %s" % (err.args[0], err.args[1])
        raise err

    my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor)
    my_cursor.execute("SET NAMES 'utf8'")
    my_cursor.execute(
        "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO,"
        "NO_ZERO_DATE,NO_ZERO_IN_DATE';"
    )

    history_file = utils.get_history_for("extract", "w")
    output_file = open(utils.get_dir_for("extract") + "/" + settings.EXTRACT_FILE, "w")

    count = 0
    last_count = 0

    history_file.write(settings.RUN_HEADER + "{}\n".format(uuid.uuid4()))
    history_file.write("Beginning extraction at: {}Z\n".format(datetime.utcnow()))
    visit_cursor = get_visits(mysql_db)
    visit = visit_cursor.fetchone()
    while visit is not None:
        visit["tz_offset"] = calculate_tz_offset(
            str(visit["visitor_localtime"]), str(visit["first_contact_server_time"])
        )

        action_cursor = get_actions_for_visit(mysql_db, visit["idvisit"])
        action = action_cursor.fetchone()

        action_count = 0
        while action is not None:
            action_count += 1
            referrer_url = None
            if action_count == 1 and visit["referer_type"] in (2, 3):
                referrer_url = visit["referer_url"]
            elif action["previous_url"]:
                url_scheme = "" if action["previous_url_prefix"] is None else URL_PREFIX[action["previous_url_prefix"]]
                referrer_url = url_scheme + action["previous_url"]

            # piwik stores searches weird.
            if action["page_title_type"] and action["page_title_type"] == 8:
                action["page_url"] = "staging.osf.io/search/?q=" + action["page_title"]
                action["page_url_prefix"] = 2
                action["page_title"] = "OSF | Search"

            pageview = {
                "visit": {
                    "id": visit["idvisit"],
                    "visitor_id": b2a_hex(visit["idvisitor"]),
                    "visitor_returning": visit["visitor_returning"],
                    "ip_addr": None if visit["location_ip"] == NULL_IP else inet_ntoa(visit["location_ip"]),
                    "user_id": visit["user_id"],
                    "tz_offset": visit["tz_offset"],
                    "ua": {
                        "os": visit["config_os"],
                        "os_version": None,
                        "browser": {
                            "version": visit["config_browser_version"],
                            "name": visit["config_browser_name"],
                            "cookies": visit["config_cookie"],
                            "locale": visit["location_browser_lang"],
                        },
                        "screen": visit["config_resolution"],
                        "device": None,
                    },
                },
                "action": {
                    "id": action["visit_action_id"],
                    "parent_node_id": action["parent_node_id"],
                    "node_id": action["node_id"],
                    "node_tags": action["node_tags"],
                    "page": {
                        "url": action["page_url"],
                        "url_prefix": None
                        if action["page_url_prefix"] is None
                        else URL_PREFIX[action["page_url_prefix"]],
                        "url_id": action["page_url_id"],
                        "url_type": action["page_url_type"],
                        "title": action["page_title"],
                        "title_id": action["page_title_id"],
                        "title_type": action["page_title_type"],
                        "is_search": True if action["page_title_type"] and action["page_title_type"] == 8 else False,
                    },
                    "referrer": referrer_url,
                    "timestamp": str(action["server_time"]),
                },
            }

            output_file.write(json.dumps(pageview) + "\n")
            history_file.write("\tLast action written timestamp: " + str(action["server_time"]) + "\n")
            count += 1
            action = action_cursor.fetchone()

        visit = visit_cursor.fetchone()

    history_file.write("Finished extraction at: {}Z\n".format(datetime.utcnow()))
    history_file.write("Final count was: {}\n".format(count))
    print ("Final count is: {}".format(count))
    history_file.close()
    output_file.close()

    mysql_db.close()