Exemplo n.º 1
0
def test_toItem():
  '''toItem() yields the PK, SK, Type, and NumberSessions attribute maps.'''
  actual_item = Visitor( visitor_id, 1 ).toItem()
  expected_item = {
    'PK': { 'S': f'VISITOR#{ visitor_id }' },
    'SK': { 'S': '#VISITOR' },
    'Type': { 'S': 'visitor' },
    'NumberSessions': { 'N': '1' }
  }
  assert actual_item == expected_item
Exemplo n.º 2
0
def test_key():
    '''key() yields only the PK and SK attribute maps of the visitor.'''
    actual_key = Visitor('0.0.0.0', 1).key()
    expected_key = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': '#VISITOR'},
    }
    assert actual_key == expected_key
Exemplo n.º 3
0
def test_toItem():
    '''toItem() yields the PK, SK, Type, and NumberSessions attribute maps.'''
    actual_item = Visitor('0.0.0.0', 1).toItem()
    expected_item = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': '#VISITOR'},
        'Type': {'S': 'visitor'},
        'NumberSessions': {'N': '1'},
    }
    assert actual_item == expected_item
Exemplo n.º 4
0
def test_processPages(table_name):
    '''Seeds the table from a generated event and checks processPages' summary.

    The visitor, visits, browsers, and sessions found in the event's records
    are added through the client before processPages is called.
    '''
    ip = randomIP()
    this_event = event(ip, table_name)
    # Every record carries its item under dynamodb -> NewImage; the item's
    # Type attribute decides which converter applies.
    images = [
        record['dynamodb']['NewImage'] for record in this_event['Records']
    ]
    visits = [
        itemToVisit(image) for image in images
        if image['Type']['S'] == 'visit'
    ]
    client = DynamoClient(table_name)
    client.addVisitor(Visitor(ip))
    client.addVisits(visits)
    browsers = [
        itemToBrowser(image) for image in images
        if image['Type']['S'] == 'browser'
    ]
    client.addBrowsers(browsers)
    sessions = [
        itemToSession(image) for image in images
        if image['Type']['S'] == 'session'
    ]
    for session in sessions:
        client.addSession(session)
    outcome = processPages(client, this_event)
    # One page is expected per distinct slug among the visits.
    unique_slugs = {visit.slug for visit in visits}
    expected = 'Successfully added ' + \
        f'{ len( unique_slugs ) } pages and updated 0 ' + \
        f'from { len( visits ) } records.'
    assert outcome == expected
Exemplo n.º 5
0
def _createNewVisitor(ip, browsers, visits, dynamo_client):
    '''Adds a new visitor with their location, browsers, and visits to the table.

    The location is built by ``requestToLocation`` from the JSON body of a GET
    request to the geo.ipify.org API, using the ``IPIFY_KEY`` environment
    variable as the API key.

    Parameters
    ----------
    ip : str
        The IP address of the visitor.
    browsers : list
        The visitor's browsers, passed through to ``addNewVisitor``.
    visits : list
        The visitor's visits, passed through to ``addNewVisitor``.
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    result : dict
        The value returned by ``dynamo_client.addNewVisitor``. When it holds
        an 'error' key, that error is printed before the result is returned.
    '''
    visitor = Visitor(ip, 1)
    api_key = os.environ.get('IPIFY_KEY')
    response = http.request(
        'GET',
        f'https://geo.ipify.org/api/v1?apiKey={ api_key }&ipAddress={ ip }')
    location = requestToLocation(json.loads(response.data.decode('utf8')))
    result = dynamo_client.addNewVisitor(visitor, location, browsers, visits)
    if 'error' in result:
        # Label the log with this function's own name so that failures are
        # attributed to the right code path.
        print('ERROR _createNewVisitor ' + result['error'])
    return result
Exemplo n.º 6
0
def _addSessionToVisitor(ip, visits, browsers, dynamo_client):
    '''Adds a new session for the visitor with the given IP address.

    Parameters
    ----------
    ip : str
        The IP address of the visitor.
    visits : list[ Visit ]
        The visits passed through to ``addNewSession``.
    browsers : list
        The browsers passed through to ``addNewSession``.
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    result : dict
        The value returned by ``dynamo_client.addNewSession``. When it holds
        an 'error' key, that error is printed before the result is returned.
    '''
    visitor = Visitor(ip)
    outcome = dynamo_client.addNewSession(visitor, browsers, visits)
    if 'error' not in outcome.keys():
        return outcome
    print('ERROR _addSessionToVisitor ' + outcome['error'])
    return outcome
Exemplo n.º 7
0
def _updateSessions(oldSessions, visits, dynamo_client):
    '''Merges several sessions and the new visits into the first old session.

    Parameters
    ----------
    oldSessions : list[ Session ]
        The old sessions that have been found to be close enough to be
        combined into a single session. The first one is kept and updated;
        the others are removed from the table.
    visits : list[ Visit ]
        The visits found in the '.parquet' file. These are combined with the
        visits of the old sessions.
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    result : dict
        ``{'error': ...}`` when the details of one of the old sessions hold an
        'error' key (returned before any session is removed), otherwise the
        value returned by ``dynamo_client.updateSession``.
    '''
    # Collect the visits of every old session before touching the table.
    old_visits = []
    for session in oldSessions:
        session_details = dynamo_client.getSessionDetails(session)
        if 'error' in session_details:
            return {'error': session_details['error']}
        old_visits += session_details['visits']
    # Only the first session survives; each removed session also lowers the
    # visitor's session count.
    for session in oldSessions[1:]:
        dynamo_client.removeSession(session)
        dynamo_client.decrementVisitorSessions(Visitor(session.ip))
    # The visits must be combined and assigned the correct attributes before
    # they are written back.
    combined_visits = processVisits(visits + old_visits)
    kept_session = oldSessions[0]
    # Only float page times enter the average; other values are skipped.
    kept_session.avgTime = np.mean([
        visit.timeOnPage for visit in combined_visits
        if isinstance(visit.timeOnPage, float)
    ])
    # NOTE(review): presumably processVisits returns the visits in date
    # order, so that last minus first spans the whole session - confirm.
    kept_session.totalTime = (combined_visits[-1].date -
                              combined_visits[0].date).total_seconds()
    result = dynamo_client.updateSession(kept_session, combined_visits)
    if 'error' in result:
        # Label the log with this function's own name; '_updateSession' is a
        # different function.
        print('ERROR _updateSessions ' + result['error'])
    return result
Exemplo n.º 8
0
def test_key():
  '''key() yields only the PK and SK attribute maps of the visitor.'''
  actual_key = Visitor( visitor_id, 1 ).key()
  expected_key = {
    'PK': { 'S': f'VISITOR#{ visitor_id }' },
    'SK': { 'S': '#VISITOR' }
  }
  assert actual_key == expected_key
Exemplo n.º 9
0
def test_numberSessions_init():
  '''Both constructor arguments are stored on the visitor.'''
  created = Visitor( visitor_id, 1 )
  assert created.id == visitor_id
  assert created.numberSessions == 1
Exemplo n.º 10
0
def processParquet(key, dynamo_client, s3_client):
    '''Adds the data from a '.parquet' file to the DynamoDB table.

    For every distinct IP address in the file, the visitor is either created,
    skipped (the session is already stored), merged into one or more stored
    sessions, or given a new session.

    Parameters
    ----------
    key : str
        The key of the '.parquet' file in the S3 bucket.
    dynamo_client : DynamoClient
        The DynamoDB client used to store the transformed data.
    s3_client : S3Client
        The S3 client used to get the '.parquet' file from.

    Raises
    ------
    Exception
        Any exception raised while processing is printed and re-raised.
    '''
    try:
        # Load the object's bytes into a pandas DataFrame.
        s3_object = s3_client.getObject(key)
        df = pd.read_parquet(io.BytesIO(s3_object['Body'].read()))
        # Each distinct IP address is handled as its own visitor.
        for ip in df['ip'].unique():
            visitor_details = dynamo_client.getVisitorDetails(Visitor(ip))
            # The browsers and visits that belong to this IP address.
            visitor_dict = processDF(df, ip)
            visitor_missing = (
                'error' in visitor_details.keys()
                and visitor_details['error'] == 'Visitor not in table'
            )
            if visitor_missing:
                # Unknown visitor: everything about them has to be added.
                _createNewVisitor(ip, visitor_dict['browsers'],
                                  visitor_dict['visits'], dynamo_client)
                continue
            first_visit_date = visitor_dict['visits'][0].date
            # A session is keyed by its start and IP address, so a session
            # starting at the first visit identifies data already stored.
            incoming_key = Session(first_visit_date, ip, 0, 0).key()
            known_sessions = visitor_details['sessions']
            if incoming_key in [session.key() for session in known_sessions]:
                continue
            # Time between each stored session and the first of these visits.
            # NOTE(review): the session's total time is ADDED to the gap, not
            # subtracted - confirm this is the intended measure.
            time_deltas = []
            for session in known_sessions:
                delta = first_visit_date - session.sessionStart
                if session.totalTime is not None:
                    delta = delta + datetime.timedelta(
                        seconds=session.totalTime)
                time_deltas.append(delta)
            # Keep the sessions whose delta is positive, under 30 minutes,
            # and within the same day.
            sessions_to_update = [
                session
                for session, delta in zip(known_sessions, time_deltas)
                if delta.days == 0
                and delta.seconds / (60 * 60) < 0.5
                and delta.seconds > 0
            ]
            match_count = len(sessions_to_update)
            if match_count == 0:
                # Nothing is close enough: the visits start a new session.
                _addSessionToVisitor(ip, visitor_dict['visits'],
                                     visitor_dict['browsers'], dynamo_client)
            elif match_count == 1:
                _updateSession(sessions_to_update[0], visitor_dict['visits'],
                               dynamo_client)
            else:
                _updateSessions(sessions_to_update, visitor_dict['visits'],
                                dynamo_client)
    except Exception as e:
        print(f'ERROR processParquet { e }')
        print(
          f'Error getting object { key } from bucket { s3_client.bucketname }.' + \
            ' Make sure they exist and your bucket is in the same region as ' + \
            'this function.'
        )
        raise e
Exemplo n.º 11
0
def test_itemToVisitor():
  '''A visitor survives the round trip through toItem() and itemToVisitor().'''
  original = Visitor( visitor_id, 1 )
  restored = itemToVisitor( original.toItem() )
  assert restored.id == original.id
  assert restored.numberSessions == original.numberSessions
Exemplo n.º 12
0
def test_repr():
    '''repr() joins the IP address and the session count with ' - '.'''
    rendered = repr(Visitor('0.0.0.0', 1))
    assert rendered == '0.0.0.0 - 1'
Exemplo n.º 13
0
def visitor():
    '''A Visitor for '0.0.0.0', built without an explicit session count.'''
    default_visitor = Visitor('0.0.0.0')
    return default_visitor
Exemplo n.º 14
0
def test_repr():
  '''repr() joins the visitor's id and the session count with ' - '.'''
  rendered = repr( Visitor( visitor_id, 1 ) )
  assert rendered == f'{ visitor_id } - 1'
Exemplo n.º 15
0
def visitor():
    '''A Visitor for '0.0.0.0', built without an explicit session count.'''
    default_visitor = Visitor('0.0.0.0')
    return default_visitor
Exemplo n.º 16
0
def test_itemToVisitor():
    '''A visitor survives the round trip through toItem() and itemToVisitor().'''
    original = Visitor('0.0.0.0', 1)
    restored = itemToVisitor(original.toItem())
    assert restored.ip == original.ip
    assert restored.numberSessions == original.numberSessions
Exemplo n.º 17
0
def test_dict():
    '''dict() of a visitor exposes its ip and numberSessions.'''
    as_dict = dict(Visitor('0.0.0.0', 1))
    assert as_dict == {'ip': '0.0.0.0', 'numberSessions': 1}
Exemplo n.º 18
0
def test_pk():
  '''pk() yields the string attribute 'VISITOR#' followed by the id.'''
  partition_key = Visitor( visitor_id, 1 ).pk()
  assert partition_key == { 'S': f'VISITOR#{ visitor_id }' }
Exemplo n.º 19
0
def test_pk():
    '''pk() yields the string attribute 'VISITOR#' followed by the IP.'''
    partition_key = Visitor('0.0.0.0', 1).pk()
    assert partition_key == {'S': 'VISITOR#0.0.0.0'}
Exemplo n.º 20
0
def visitor():
  '''A Visitor for visitor_id, built without an explicit session count.'''
  default_visitor = Visitor( visitor_id )
  return default_visitor
Exemplo n.º 21
0
def test_default_init():
    '''Without a session count, numberSessions is 0.'''
    created = Visitor('0.0.0.0')
    assert created.ip == '0.0.0.0'
    assert created.numberSessions == 0
Exemplo n.º 22
0
def test_dict():
  '''dict() of a visitor exposes its id and numberSessions.'''
  as_dict = dict( Visitor( visitor_id, 1 ) )
  expected = { 'id': visitor_id, 'numberSessions': 1 }
  assert as_dict == expected
Exemplo n.º 23
0
def _mergeBoundaryVisit(last_visit, first_visit):
  """Merges a session's last stored visit with a record's first visit.

  Used when both visits are of the same page. The identity, date, session
  start, and previous page come from `last_visit`; the next page comes from
  `first_visit`; the scroll events of both are combined, with `first_visit`
  winning on duplicate keys.

  Args:
      last_visit (Visit): The last visit of the stored session.
      first_visit (Visit): The first visit of the new record.

  Returns:
      Visit: The single visit that replaces both.
  """
  time_format = '%Y-%m-%dT%H:%M:%S.%fZ'
  # The time on the merged page runs from the first scroll event of the
  # stored visit to the last scroll event of the record's visit.
  ended = datetime.datetime.strptime(
    list( first_visit.scrollEvents.keys() )[-1], time_format
  )
  started = datetime.datetime.strptime(
    list( last_visit.scrollEvents.keys() )[0], time_format
  )
  return Visit(
    last_visit.id, # visitor_id
    last_visit.date, # date
    last_visit.user, # user
    last_visit.title, # title
    last_visit.slug, # slug
    last_visit.sessionStart, # sessionStart
    { **last_visit.scrollEvents, **first_visit.scrollEvents }, # scrollEvents
    ( ended - started ).total_seconds(), # timeOnPage
    last_visit.prevTitle, # prevTitle
    last_visit.prevSlug, # prevSlug
    first_visit.nextTitle, # nextTitle
    first_visit.nextSlug # nextSlug
  )


def s3_processor(event, context):
  """Stores the session found in an S3 event's object in the DynamoDB table.

  Only the first record of the event is read. The object is parsed by
  `processDF` into a session with its visits and browsers. These are added as
  a new visitor, merged into the visitor's last session when they are within
  30 minutes of it, or added as an additional session otherwise.

  Args:
      event (dict): The S3 event; `Records[0]` supplies the object key, the
        bucket name, and the AWS region.
      context: Unused.

  Returns:
      dict: A `statusCode` of 200 and a JSON `body` with the number of
        updated, new, and additional sessions.
  """
  new = 0
  updated = 0
  additional = 0
  # Get the necessary data from the S3 event.
  s3_record = event['Records'][0]
  key = urllib.parse.unquote_plus(
    s3_record['s3']['object']['key'], encoding='utf-8'
  )
  aws_region = s3_record['awsRegion']
  bucket_name = s3_record['s3']['bucket']['name']
  # Create the necessary clients
  dynamo_client = DynamoClient( os.environ['TABLE_NAME'], aws_region )
  s3_client = S3Client( bucket_name, aws_region )
  # Parse the record to get the browsers, visits, and session.
  record = processDF( key, s3_client )
  # Get the visitor from the table
  visitor_details = dynamo_client.getVisitorDetails(
    Visitor( record['session'].id )
  )
  # Add the visitor, visits, session, and browsers if the visitor is not in
  # the table.
  if 'visitor' not in visitor_details:
    dynamo_client.addVisitor( Visitor( record['session'].id ) )
    dynamo_client.addSession( record['session'] )
    dynamo_client.addVisits( record['visits'] )
    dynamo_client.addBrowsers( record['browsers'] )
    new += 1
  # Check to see if the last session can be combined with the one in this
  # record.
  else:
    # NOTE(review): assumes 'sessions' and 'visits' are ordered by date, so
    # that [-1] is the most recent entry - confirm against getVisitorDetails.
    last_session = visitor_details['sessions'][-1]
    last_sessions_visits = [
      visit for visit in visitor_details['visits']
      if visit.sessionStart == last_session.sessionStart
    ]
    last_stored_visit = last_sessions_visits[-1]
    first_new_visit = record['visits'][0]
    # The gap is taken as an absolute value. The signed difference
    # (stored - new) is negative for every record that follows the stored
    # session, which made the 30 minute test pass no matter how late the
    # record was.
    gap_seconds = abs(
      ( first_new_visit.date - last_stored_visit.date ).total_seconds()
    )
    # Combine the visits and update the session when the last session was
    # less than 30 minutes from this record.
    if gap_seconds < 60 * 30:
      # Update all of the record's visits with the previous session start
      for visit in record['visits']:
        visit.sessionStart = last_session.sessionStart
      # Merge the two boundary visits when the first visit of the record is
      # the last page visited in the previous session.
      if last_stored_visit.title == first_new_visit.title:
        updated_visit = _mergeBoundaryVisit(
          last_stored_visit, first_new_visit
        )
        visits_to_update = [ updated_visit ] + record['visits'][1:] + \
          last_sessions_visits[:-1]
      else:
        visits_to_update = record['visits'] + last_sessions_visits
      # Update all of the visits in the record to have the session
      dynamo_client.updateVisits( visits_to_update )
      dynamo_client.addBrowsers( record['browsers'] )
      page_times = [ visit.timeOnPage for visit in visits_to_update ]
      dynamo_client.updateSession(
        Session(
          last_session.sessionStart, # Start date-time
          last_session.id, # Visitor ID
          np.mean( page_times ), # avgTime
          np.sum( page_times ) # totalTime
        ),
        []
      )
      updated += 1
    # Add the new session, visits, and browsers when the last session was
    # more than 30 minutes from this record.
    else:
      dynamo_client.addSession( record['session'] )
      dynamo_client.addVisits( record['visits'] )
      dynamo_client.addBrowsers( record['browsers'] )
      additional += 1

  return {
    'statusCode': 200,
    'body': json.dumps(f'updated { updated }\nnew { new }\nadditional {additional}')
  }
Exemplo n.º 24
0
def test_default_init():
  '''Without a session count, numberSessions is 0.'''
  created = Visitor( visitor_id )
  assert created.id == visitor_id
  assert created.numberSessions == 0
Exemplo n.º 25
0
def test_numberSessions_init():
    '''Both constructor arguments are stored on the visitor.'''
    created = Visitor('0.0.0.0', 1)
    assert created.ip == '0.0.0.0'
    assert created.numberSessions == 1