Example #1
File: load.py, Project: atelic/osf.io
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()
    extract_complaints.close()


    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print("Doing dry-run upload to Elastic search.  Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            # the index may not exist yet; log it and carry on
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            )
        }

    tally = {}
    seen = {}

    try:
        # each line of resume.log names a batch that already uploaded
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log:
                seen[seen_file.strip('\n')] = 1
    except IOError:
        # no resume.log yet: nothing has been uploaded
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:  # Pass 0 for unbuffered writing
        for batch_id in range(1, batch_count+1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("  ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
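
A note on the pattern above: resume.log records a '{domain}-{batch_id}' id after each successful batch, and recorded ids are skipped on the next run, so a crashed upload can be restarted without double-posting events. A minimal, self-contained sketch of the same idea (the process() callback and the log path are hypothetical, not taken from the script above):

RESUME_LOG = 'resume.log'  # hypothetical path

def already_done():
    """Return the set of work-unit ids recorded in the resume log."""
    try:
        with open(RESUME_LOG, 'r') as fp:
            return set(line.strip() for line in fp)
    except IOError:
        return set()  # no log yet: nothing has been processed

def run(work_ids, process):
    seen = already_done()
    with open(RESUME_LOG, 'a') as log:
        for work_id in work_ids:
            if work_id in seen:
                continue  # finished in a previous run
            process(work_id)
            log.write(work_id + '\n')  # checkpoint only after success
            log.flush()                # keep the log current across crashes
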
Example #2
def main(force=False):
    history_run_id = utils.get_history_run_id_for('extract')
    complaints_run_id = utils.get_complaints_run_id_for('extract')
    if history_run_id != complaints_run_id:
        print('You need to validate your exported data! Bailing...')
        sys.exit()

    extract_complaints = utils.get_complaints_for('extract', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print('You have unaddressed complaints!')
        if not force:
            print('  ...pass --force to ignore')
            sys.exit()
    extract_complaints.close()

    sqlite_db = sqlite3.connect(settings.SQLITE_PATH)
    sqlite_db.row_factory = sqlite3.Row
    sqlite_setup(sqlite_db)

    transform_dir = utils.get_dir_for('transform01')

    logger.info('Run ID: {}\n'.format(complaints_run_id))
    logger.info('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))
    tally = {'missing_user': 0, 'missing_node': 0}
    lastline = 0
    try:
        with open(utils.get_dir_for('transform01') + '/resume.log', 'r') as fp:
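            # jump to 32 bytes before EOF (whence=2) so readlines() only scans
            # the tail of the log; a short or missing file raises IOError,
            # which the except below treats as "start from line 0"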
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0  # so the final-count log is correct even if the input is empty
    with open(utils.get_dir_for('transform01') + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(transform_dir + '/' + settings.TRANSFORM01_FILE, 'a') as output_file:
            with open(utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'r') as input_file:
                print('Lastline is: {}\n'.format(lastline))
                for i, pageview_json in enumerate(input_file):
                    linenum = i + 1
                    if linenum <= lastline:
                        if not linenum % 1000:
                            print('Skipping line {} of ***{}***'.format(linenum, lastline))
                        continue

                    if not linenum % 1000:
                        print('Transforming line {}'.format(linenum))

                    raw_pageview = json.loads(pageview_json)
                    visit = raw_pageview['visit']
                    action = raw_pageview['action']

                    # lookup location by ip address. piwik strips last 16 bits, so may not be completely
                    # accurate, but should be close enough.
                    ip_addr = visit['ip_addr']
                    location = get_location_for_ip_addr(ip_addr, sqlite_db)

                    # user has many visitor ids, visitor id has many session ids.
                    # in keen, visitor id will refresh 1/per year, session 1/per 30min.
                    visitor_id = get_or_create_visitor_id(visit['visitor_id'], sqlite_db)
                    session_id = get_or_create_session_id(visit['id'], sqlite_db)

                    user_id = visit['user_id']
                    user = get_or_create_user(user_id, sqlite_db)

                    node_id = action['node_id']
                    node = get_or_create_node(node_id, sqlite_db)

                    browser_version = [None, None]
                    if visit['ua']['browser']['version']:
                        browser_version = visit['ua']['browser']['version'].split('.')

                    os_version = [None, None]
                    if visit['ua']['os_version']:
                        os_version = visit['ua']['os_version'].split('.')
                        if len(os_version) == 1:
                            os_version.append(None)

                    os_family = parse_os_family(visit['ua']['os'])
                    if visit['ua']['os'] == 'WIN' and visit['ua']['os_version']:
                        os_family = os_family.replace('<Unknown Version>', visit['ua']['os_version'])

                    browser_info = {
                        'device': {
                            'family': visit['ua']['device'],
                        },
                        'os': {
                            'major': os_version[0],
                            'patch_minor': None,
                            'minor': os_version[1],
                            'family': os_family,
                            'patch': None,
                        },
                        'browser': {
                            'major': browser_version[0],
                            'minor': browser_version[1],
                            'family': parse_browser_family(visit['ua']['browser']['name']),
                            'patch': None,
                        },
                    }

                    # normalize e.g. 'en-us' to 'en-US'; None when the locale has no region
                    browser_language = None
                    if '-' in visit['ua']['browser']['locale']:
                        browser_locale = visit['ua']['browser']['locale'].split('-')
                        browser_language = '-'.join([browser_locale[0], browser_locale[1].upper()])

                    node_tags = None if action['node_tags'] is None else action['node_tags'].split(',')

                    # piwik stores resolution as 1900x600 mostly, but sometimes as a float?
                    # For the sake of my sanity and yours, let's ignore floats.
                    screen_resolution = (None, None)
                    if 'x' in visit['ua']['screen']:
                        screen_resolution = visit['ua']['screen'].split('x')

                    # piwik fmt: '2016-05-11 20:30:00', keen fmt: '2016-06-30T17:12:50.070Z'
                    # piwik is always utc
                    utc_timestamp = datetime.strptime(action['timestamp'], '%Y-%m-%d %H:%M:%S')
                    utc_ts_formatted = utc_timestamp.isoformat() + '.000Z'  # naive, but correct

                    local_timedelta = timedelta(minutes=visit['tz_offset'])
                    local_timestamp = utc_timestamp + local_timedelta

                    pageview = {
                        'meta': {
                            'epoch': 0,  # migrated from piwik
                        },
                        'page': {
                            'title': action['page']['title'],
                            'url': action['page']['url_prefix'] + action['page']['url'] if action['page']['url'] is not None else None,
                            'info': {}  # (add-on)
                        },
                        'referrer': {
                            'url': action['referrer'] or None,
                            'info': {},  # (add-on)
                        },
                        'tech': {
                            'browser': {  # JS-side will be filled in by Keen.helpers.getBrowserProfile()
                                'cookies': bool(visit['ua']['browser']['cookies']),
                                'language': browser_language,
                                'screen': {
                                    'height': screen_resolution[1],
                                    'width': screen_resolution[0],
                                },
                            },
                            'ip': ip_addr,  # private
                            'ua': None,
                            'info': browser_info,
                        },
                        'time': {
                            'utc': timestamp_components(utc_timestamp),
                            'local': timestamp_components(local_timestamp),
                        },
                        'visitor': {
                            'id': visitor_id,
                            'session': session_id,
                            'returning': bool(visit['visitor_returning']),  # visit
                        },
                        'user': {
                            'id': user_id,
                            'entry_point': '' if user is None else user['entry_point'],  # empty string if no user
                            'locale': '' if user is None else user['locale'],  # empty string if no user
                            'timezone': '' if user is None else user['timezone'],  # empty string if no user
                            'institutions': None if user is None else user['institutions'],  # null if no user, else []
                        },
                        'node': {
                            'id': node_id,
                            'title': None if node is None else node['title'],
                            'type': None if node is None else node['category'],
                            'tags': node_tags,
                            'made_public_date': None if node is None else node['made_public_date'],
                        },
                        'geo': {},
                        'anon': {
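                            # anonymous id: a one-way md5 hash of the session id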
                            'id': md5(session_id).hexdigest(),
                            'continent': None if location is None else location['continent'],
                            'country': None if location is None else location['country'],
                        },
                        'keen': {
                            'timestamp': utc_ts_formatted,
                            'addons': [
                                {
                                    'name': 'keen:referrer_parser',
                                    'input': {
                                        'referrer_url': 'referrer.url',
                                        'page_url': 'page.url'
                                    },
                                    'output': 'referrer.info'
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'page.url'
                                    },
                                    'output': 'page.info'
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'referrer.url'
                                    },
                                    'output': 'referrer.info'
                                },
                                {  # private
                                    'name': 'keen:ip_to_geo',
                                    'input': {
                                        'ip': 'tech.ip'
                                    },
                                    'output': 'geo',
                                }
                            ],
                        }
                    }

                    if node_id is None:
                        tally['missing_node'] += 1

                    if user_id is None:
                        tally['missing_user'] += 1

                    output_file.write(json.dumps(pageview) + '\n')
                    resume_file.write(str(linenum) + '\n')

    logger.info('Finished extraction at: {}Z\n'.format(datetime.utcnow()))
    logger.info('Final count was: {}\n'.format(linenum))
    logger.info('{} pageviews lacked a user id.\n'.format(tally['missing_user']))
    logger.info('{} pageviews lacked a node id.\n'.format(tally['missing_node']))
    sqlite_db.close()
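
The timestamp handling above deserves a note: piwik stores naive UTC strings ('2016-05-11 20:30:00'), Keen wants ISO-8601 with milliseconds and a 'Z' suffix, and visitor-local time is recovered by adding the visit's tz_offset in minutes. A standalone sketch of that conversion (the function name is ours, not from the script):

from datetime import datetime, timedelta

def piwik_to_keen(piwik_ts, tz_offset_minutes):
    """Convert piwik's naive-UTC 'YYYY-MM-DD HH:MM:SS' into the Z-suffixed
    ISO-8601 string Keen expects, plus the visitor-local datetime."""
    utc = datetime.strptime(piwik_ts, '%Y-%m-%d %H:%M:%S')
    keen_ts = utc.isoformat() + '.000Z'  # piwik has no sub-second precision
    local = utc + timedelta(minutes=tz_offset_minutes)
    return keen_ts, local

# piwik_to_keen('2016-05-11 20:30:00', -240)
# -> ('2016-05-11T20:30:00.000Z', datetime.datetime(2016, 5, 11, 16, 30))
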
Example #3
def main(force=False):

    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()
    extract_complaints.close()


    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    # batch-file name templates (not used directly in this excerpt)
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
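            # read only the tail of the resume log (same trick as the
            # transform01 script above)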
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass


    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # only pageviews logged after the most recent make public date are copied to public
                # collection
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    # strip everything potentially identifying from the public copy
                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    # filter instead of remove()-while-iterating, which skips elements
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
                    resume_file.write(str(linenum) + '\n')  # checkpoint so an interrupted run can resume

        if linenum % settings.BATCH_SIZE != 0:
            batchnum += 1
            write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
            write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
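
write_batch() is called above but not defined in this excerpt. For the batching loop to be correct it must both persist the accumulated pageviews and clear the lists in place; otherwise every batch would re-emit all earlier pageviews and memory would grow without bound. A hypothetical sketch consistent with that contract and with the public-NNNN.data naming defined earlier (the run-id header line is an assumption):

import json

def write_batch(batchnum, run_id, domain, pageviews, transform_dir):
    """Hypothetical: dump one domain's accumulated pageviews to a numbered
    batch file, then empty the list in place so the caller's next batch
    starts fresh. The real implementation is not shown in this excerpt."""
    path = '{}/{}-{:04d}.data'.format(transform_dir, domain, batchnum)
    with open(path, 'w') as fp:
        fp.write('{}\n'.format(run_id))  # assumed: run id as a header line
        for pageview in pageviews:
            fp.write(json.dumps(pageview) + '\n')
    del pageviews[:]  # clear in place; the caller keeps the same list object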