示例#1
0
def test_itemToSession():
    '''A Session converted with toItem() and back keeps its attributes.'''
    original = Session(datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1)
    restored = itemToSession(original.toItem())
    # Each attribute of the rebuilt session must equal the original's.
    for attribute in ('sessionStart', 'ip', 'avgTime', 'totalTime'):
        assert getattr(restored, attribute) == getattr(original, attribute)
示例#2
0
def test_toItem():
    '''toItem() returns the complete typed item for a session.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    expected_item = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'SESSION#2020-01-01T00:00:00.000Z'},
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'},
        'GSI2SK': {'S': '#SESSION'},
        'Type': {'S': 'session'},
        'AverageTime': {'N': '0.1'},
        'TotalTime': {'N': '0.1'},
    }
    assert Session(start, '0.0.0.0', 0.1, 0.1).toItem() == expected_item
示例#3
0
def test_itemToSession():
    '''A Session converted with toItem() and back keeps its attributes.'''
    original = Session(session_start, visitor_id, avg_time, total_time)
    restored = itemToSession(original.toItem())
    # Each attribute of the rebuilt session must equal the original's.
    for attribute in ('sessionStart', 'id', 'avgTime', 'totalTime'):
        assert getattr(restored, attribute) == getattr(original, attribute)
示例#4
0
def test_toItem():
    '''toItem() returns the complete typed item for a session.'''
    expected_item = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'SESSION#{ session_start }'},
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': '#SESSION'},
        'Type': {'S': 'session'},
        'AverageTime': {'N': str(avg_time)},
        'TotalTime': {'N': str(total_time)},
    }
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.toItem() == expected_item
示例#5
0
def test_key():
    '''key() returns only the PK and SK entries of the session.'''
    expected_key = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'SESSION#{ session_start }'},
    }
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.key() == expected_key
示例#6
0
def test_gsi2():
    '''gsi2() returns the GSI2PK and GSI2SK entries of the session.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    expected_index = {
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'},
        'GSI2SK': {'S': '#SESSION'},
    }
    assert Session(start, '0.0.0.0', 0.1, 0.1).gsi2() == expected_index
示例#7
0
def test_gsi2():
    '''gsi2() returns the GSI2PK and GSI2SK entries of the session.'''
    expected_index = {
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': '#SESSION'},
    }
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.gsi2() == expected_index
示例#8
0
def test_key():
    '''key() returns only the PK and SK entries of the session.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    expected_key = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'SESSION#2020-01-01T00:00:00.000Z'},
    }
    assert Session(start, '0.0.0.0', 0.1, 0.1).key() == expected_key
示例#9
0
    def updateSession(self, session, visits, print_error=True):
        '''Updates a session with new visits and attributes.

    The item written to the table is rebuilt from ``visits``: the first visit
    supplies the session start and IP address, and the average and total times
    are recomputed. The ``session`` argument itself is only type-checked.

    Parameters
    ----------
    session : Session
      The session to change the average time on page and the total time on the
      website.
    visits : list[ Visit ]
      All of the visits that belong to the session. Must not be empty.
    print_error : bool
      Whether to print the client error when the table update fails.

    Returns
    -------
    result : dict
      The rebuilt 'session' and the 'visits' on success, otherwise an 'error'
      message.

    Raises
    ------
    ValueError
      When ``session`` is not a Session, or ``visits`` is not a non-empty list
      of Visit objects.
    '''
        if not isinstance(session, Session):
            raise ValueError('Must pass a Session object')
        if not isinstance(visits, list):
            raise ValueError('Must pass a list of Visit objects')
        if not all(isinstance(visit, Visit) for visit in visits):
            raise ValueError('List of visits must be of Visit type')
        # The session is rebuilt from the first and last visit, so an empty
        # list cannot be processed. Reject it here instead of failing later
        # with an IndexError.
        if not visits:
            raise ValueError('Must pass at least one Visit object')
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        ]
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
        else:
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
        else:
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
        try:
            # The condition makes the put fail unless an item with this key is
            # already in the table.
            self.client.put_item(TableName=self.tableName,
                                 Item=session.toItem(),
                                 ConditionExpression='attribute_exists(PK)')
            self.addVisits(visits)
            return {'session': session, 'visits': visits}
        except ClientError as e:
            if print_error:
                print(f'ERROR updateSession: { e }')
            error_code = e.response['Error']['Code']
            if error_code == 'ConditionalCheckFailedException':
                return {'error': f'Session not in table { session }'}
            return {'error': 'Could not update session in table'}
示例#10
0
def test_default_init():
    '''The constructor parses a string start time and stores the other values.'''
    expected_start = datetime.datetime.strptime(
        session_start, '%Y-%m-%dT%H:%M:%S.%fZ')
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.sessionStart == expected_start
    assert built.id == visitor_id
    assert built.avgTime == avg_time
    assert built.totalTime == total_time
示例#11
0
    def addNewSession(self, visitor, browsers, visits):
        '''Adds a new session to the table for the given visitor.

    The session is built from ``visits``: the first visit supplies the session
    start and IP address. The steps run in order (increment the visitor's
    sessions, add the browsers, add the session, add the visits) and stop at
    the first step that reports an error.

    Parameters
    ----------
    visitor : Visitor
      The returning visitor. They will have their number of sessions
      incremented.
    browsers : list[ Browser ]
      The visitor's browsers to be added to the table.
    visits: list[ Visit ]
      The visits to be added to the table. Must not be empty.

    Returns
    -------
    result : dict
      The result of adding a new session for a visitor. This could be either
      the error that occurs or the updated visitor, the browsers added, the
      visits added, and the session added to the table.
    '''
        # The session is derived from the first and last visit. Check this
        # before any write so that an empty list does not leave the visitor
        # incremented and the browsers added without a session.
        if not visits:
            return {'error': 'Must pass at least one visit'}
        result = self.incrementVisitorSessions(visitor)
        if 'error' in result:
            return {'error': result['error']}
        visitor = result['visitor']
        result = self.addBrowsers(browsers)
        if 'error' in result:
            return {'error': result['error']}
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        ]
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
        else:
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
        else:
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
        result = self.addSession(session)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addVisits(visits)
        if 'error' in result:
            return {'error': result['error']}
        return {
            'visitor': visitor,
            'browsers': browsers,
            'visits': visits,
            'session': session
        }
示例#12
0
def processDF( key, s3_client ):
  '''Reads a raw CSV file from S3 and parses the browsers, visits, and session.

  Consecutive rows with the same title are grouped into one visit. All visits
  are assigned to a single session that starts at the first visit.

  Parameters
  ----------
  key : str
    The key of the raw file in the S3 bucket.
  s3_client : S3Client
    The S3 client used to get the file.

  Returns
  -------
  result : dict
    The 'visits', 'session', and 'browsers' parsed from the file.
  '''
  request = s3_client.getObject( key )
  # Read the delimited file as a pandas DF indexed by the 'time' column. The
  # 'process' column is named so that it can be left out of 'usecols'.
  df = pd.read_csv(
    io.BytesIO( request['Body'].read() ),
    sep = ',\t', engine = 'python',
    names = [
      'process', 'id', 'time', 'title', 'slug', 'userAgent', 'width',
      'height', 'x', 'y'
    ],
    usecols = [
      'id', 'time', 'title', 'slug', 'userAgent', 'width', 'height', 'x', 'y'
    ],
    index_col = 'time'
  )
  df = df.drop_duplicates().sort_index()
  # The index values of the rows whose title differs from the previous row.
  # Each one marks the start of a new visit.
  index_change = df.ne(
    df.shift()
  ).apply( lambda x: x.index[x].tolist() ).title
  # The ( start, stop ) index range of each visit. A visit stops just before
  # the next one starts, and the last visit stops at the last row.
  indexes = [
    ( index_change[index], index_change[index + 1] - 1 )
      if index != len( index_change ) - 1
    else (index_change[index], df.tail(1).index[0])
    for index in  range( len( index_change ) )
  ]
  visits = []
  for ( start, stop ) in indexes:
    temp = df.loc[ start: stop ]
    visits.append(
      Visit(
        temp.id.unique()[0],
        formatEpoch( temp.iloc[[0]].index[0] ),
        '0',
        temp.title.unique()[0],
        temp.slug.unique()[0],
        formatEpoch( temp.iloc[[0]].index[0] ),
        {
          formatEpoch( index ): { 'x': row.x, 'y': row.y }
          for index, row in temp.iterrows()
        },
        # Last index minus first index, scaled by 1000.
        # NOTE(review): presumably epoch milliseconds to seconds; confirm.
        ( temp.iloc[[-1]].index[0] - temp.iloc[[0]].index[0] ) / 1000
      )
    )
  # Every visit in the file belongs to the session started by the first visit.
  for visit in visits:
    visit.sessionStart=visits[0].date
  # Link each visit to the one after it and the one before it.
  for index in range( 1, len( visits ) ):
    visits[index - 1].nextTitle = visits[index].title
    visits[index - 1].nextSlug = visits[index].slug
  for index in range( len( visits ) - 1 ):
    visits[index + 1].prevTitle = visits[index].title
    visits[index + 1].prevSlug = visits[index].slug
  visitor_id = df.id.unique()[0]
  page_times = [ visit.timeOnPage for visit in visits ]
  session = Session(
    visits[0].sessionStart,
    visitor_id,
    np.mean( page_times ),
    np.sum( page_times )
  )
  # One browser per unique ( userAgent, height, width ) combination, dated by
  # the first row that matches all three. The user agent is part of the match
  # so that two user agents with the same viewport size do not share a date.
  browsers = [
    Browser(
      visitor_id,
      row.userAgent,
      row.width,
      row.height,
      formatEpoch(
        df.loc[
          ( df['userAgent'] == row.userAgent )
          & ( df['height'] == row.height )
          & ( df['width'] == row.width )
        ].head(1).index[0]
      )
    )
    for _, row in df.groupby(
      ['userAgent','height','width']
    ).size().reset_index().rename(
      columns={0:'count'}
    ).iterrows()
  ]
  return{ 'visits': visits, 'session': session, 'browsers': browsers }
示例#13
0
def test_pk():
    '''pk() returns the visitor key built from the IP address.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    expected_pk = {'S': 'VISITOR#0.0.0.0'}
    assert Session(start, '0.0.0.0', 0.1, 0.1).pk() == expected_pk
示例#14
0
def test_gsi2pk():
    '''gsi2pk() combines the visitor ID and the session start.'''
    expected_pk = {'S': f'SESSION#{ visitor_id }#{ session_start }'}
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.gsi2pk() == expected_pk
示例#15
0
def test_default_init():
    '''The constructor parses a string start time and stores the other values.'''
    expected_start = datetime.datetime(2020, 1, 1, 0, 0)
    built = Session('2020-01-01T00:00:00.000Z', '0.0.0.0', 0.1, 0.1)
    assert built.sessionStart == expected_start
    assert built.ip == '0.0.0.0'
    assert built.avgTime == 0.1
    assert built.totalTime == 0.1
示例#16
0
def test_pk():
    '''pk() returns the visitor key built from the visitor ID.'''
    expected_pk = {'S': f'VISITOR#{ visitor_id }'}
    built = Session(session_start, visitor_id, avg_time, total_time)
    assert built.pk() == expected_pk
示例#17
0
def test_repr():
    '''repr() shows the IP address and the total time.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    rendered = repr(Session(start, '0.0.0.0', 0.1, 0.1))
    assert rendered == '0.0.0.0 - 0.1'
示例#18
0
def test_gsi2pk():
    '''gsi2pk() combines the IP address and the session start.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    expected_pk = {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'}
    assert Session(start, '0.0.0.0', 0.1, 0.1).gsi2pk() == expected_pk
示例#19
0
def session():
  '''Builds a Session from the module-level test values.'''
  built = Session( session_start, visitor_id, avg_time, total_time )
  return built
示例#20
0
def s3_processor(event, context):
  """Parses the file named in an S3 event and stores its data in the table.

  Only the first record of the event is read. The file is parsed into
  browsers, visits, and a session. These are then either added for a new
  visitor, merged into the visitor's last session, or added as an additional
  session.

  Args:
      event (dict): The S3 event. Its first record supplies the object key,
        the AWS region, and the bucket name.
      context: Not used.

  Returns:
      dict: A 'statusCode' of 200 and a JSON 'body' with the number of
        updated, new, and additional sessions.
  """
  new = 0
  updated = 0
  additional = 0
  # Get the necessary data from the S3 event.
  s3_record = event['Records'][0]
  key = urllib.parse.unquote_plus(
    s3_record['s3']['object']['key'], encoding='utf-8'
  )
  aws_region = s3_record['awsRegion']
  bucket_name = s3_record['s3']['bucket']['name']
  # Create the necessary clients
  dynamo_client = DynamoClient( os.environ['TABLE_NAME'], aws_region )
  s3_client = S3Client( bucket_name, aws_region )
  # Parse the record to get the browsers, visits, and session.
  record = processDF( key, s3_client )
  # Get the visitor from the table
  visitor_details = dynamo_client.getVisitorDetails(
    Visitor( record['session'].id )
  )
  # Add the visitor, visits, session, and browsers if the visitor is not in
  # the table.
  if 'visitor' not in visitor_details:
    dynamo_client.addVisitor( Visitor( record['session'].id ) )
    dynamo_client.addSession( record['session'] )
    dynamo_client.addVisits( record['visits'] )
    dynamo_client.addBrowsers( record['browsers'] )
    new += 1
  # Check to see if the last session can be combined with the one in this
  # record.
  else:
    last_session = visitor_details['sessions'][-1]
    last_sessions_visits = [
      visit for visit in visitor_details['visits']
      if visit.sessionStart == last_session.sessionStart
    ]
    last_visit = last_sessions_visits[-1]
    first_visit = record['visits'][0]
    # The gap between the last stored visit and the first visit of this
    # record. The absolute value is used because the plain difference
    # ( last - first ) is negative for every later record, which made the
    # check below always pass.
    seconds_apart = abs(
      ( first_visit.date - last_visit.date ).total_seconds()
    )
    # Combine the visits and update the session when the last session was
    # less than 30 minutes from this record,
    if seconds_apart < 60 * 30:
      # Update all of the record's with the previous session start
      for visit in record['visits']:
        visit.sessionStart = last_session.sessionStart
      # Update the last visit of the last session when the first visit of
      # the record is the last page visited in the previous session.
      if last_visit.title == first_visit.title:
        timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
        # The total time on the updated page is the last scroll event on
        # the record's first visit minus the first scroll event of the
        # last visit of the session to update.
        last_scroll = datetime.datetime.strptime(
          list( first_visit.scrollEvents.keys() )[-1], timestamp_format
        )
        first_scroll = datetime.datetime.strptime(
          list( last_visit.scrollEvents.keys() )[0], timestamp_format
        )
        updated_visit = Visit(
          last_visit.id, # visitor_id
          last_visit.date, # date
          last_visit.user, # user
          last_visit.title, # title
          last_visit.slug, # slug
          last_visit.sessionStart, # sessionStart
          {
            **last_visit.scrollEvents,
            **first_visit.scrollEvents
          }, # scrollEvents
          ( last_scroll - first_scroll ).total_seconds(), # timeOnPage
          last_visit.prevTitle, # prevTitle
          last_visit.prevSlug, # prevSlug
          first_visit.nextTitle, # nextTitle
          first_visit.nextSlug # nextSlug
        )
        visits_to_update = [ updated_visit ] + record['visits'][1:] + \
          last_sessions_visits[:-1]
      else:
        visits_to_update = record['visits'] + last_sessions_visits
      # Update all of the visits in the record to have the session
      dynamo_client.updateVisits( visits_to_update )
      dynamo_client.addBrowsers( record['browsers'] )
      page_times = [ visit.timeOnPage for visit in visits_to_update ]
      dynamo_client.updateSession(
        Session(
          last_session.sessionStart, # Start date-time
          last_session.id, # Visitor ID
          np.mean( page_times ), # avgTime
          np.sum( page_times ) # totalTime
        ),
        []
      )
      updated += 1
    # Add the new session, visits, and browsers when the last session was
    # more than 30 minutes from this record.
    else:
      dynamo_client.addSession( record['session'] )
      dynamo_client.addVisits( record['visits'] )
      dynamo_client.addBrowsers( record['browsers'] )
      additional += 1

  return {
    'statusCode': 200,
    'body': json.dumps(f'updated { updated }\nnew { new }\nadditional {additional}')
  }
示例#21
0
def year_session():
  '''Builds a Session that starts on 2020-01-01 for the test visitor.'''
  built = Session( '2020-01-01T00:00:00.000Z', visitor_id, 60.0, 60.0 )
  return built
示例#22
0
def test_datetime_init():
    '''The constructor accepts a datetime start time and stores it as given.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    built = Session(start, '0.0.0.0', 0.1, 0.1)
    assert built.sessionStart == datetime.datetime(2020, 1, 1, 0, 0)
    assert built.ip == '0.0.0.0'
    assert built.avgTime == 0.1
    assert built.totalTime == 0.1
示例#23
0
def session():
    '''Builds a Session that starts on 2020-01-03 for IP 0.0.0.0.'''
    built = Session('2020-01-03T00:00:00.000Z', '0.0.0.0', 60.0, 60.0)
    return built
示例#24
0
def test_repr():
    '''repr() shows the visitor ID and the total time.'''
    rendered = repr(Session(session_start, visitor_id, avg_time, total_time))
    assert rendered == f'{ visitor_id } - { total_time }'
示例#25
0
def year_session():
    '''Builds a Session that starts on 2020-01-01 for IP 0.0.0.1.'''
    built = Session('2020-01-01T00:00:00.000Z', '0.0.0.1', 60.0, 60.0)
    return built
示例#26
0
def processParquet(key, dynamo_client, s3_client):
    '''Adds the data from a '.parquet' file to the DynamoDB table.

  The file is split per IP address. Each visitor is either created, skipped
  because the session is already stored, merged into one or more recent
  sessions, or given a new session.

  Parameters
  ----------
  key : str
    The key of the '.parquet' file in the S3 bucket.
  dynamo_client : DynamoClient
    The DynamoDB client used to store the transformed data.
  s3_client : S3Client
    The S3 client used to get the '.parquet' file from.

  Raises
  ------
  Exception
    Any exception raised while processing is printed and then re-raised.
  '''
    try:
        request = s3_client.getObject(key)
        # Read the parquet file as a pandas DF
        df = pd.read_parquet(io.BytesIO(request['Body'].read()))
        # Iterate over the unique IP addresses to organize the DF's per
        # visitor
        for ip in df['ip'].unique():
            # Get the visitor details from the table.
            visitor_details = dynamo_client.getVisitorDetails(Visitor(ip))
            # Get the browsers and visits of the specific IP address.
            visitor_dict = processDF(df, ip)
            # When the visitor is not found in the database, the visitor, location,
            # browser, session, and visits must be added to the database.
            if 'error' in visitor_details.keys() \
              and visitor_details['error'] == 'Visitor not in table':
                # Add the new visitor and their data to the table
                _createNewVisitor(ip, visitor_dict['browsers'],
                                  visitor_dict['visits'], dynamo_client)
            # Otherwise, determine whether to add a new session, update a visitor's
            # session, or combine multiple sessions.
            else:
                first_visit_date = visitor_dict['visits'][0].date
                stored_sessions = visitor_details['sessions']
                # Skip the session when the session is already in the table.
                if Session(first_visit_date, ip, 0, 0).key() in [
                        session.key() for session in stored_sessions
                ]:
                    continue
                # Calculate the time between the end of each stored session
                # ( its start plus its total time ) and the visitor's first
                # visit. The parentheses matter: without them the total time
                # is added to the gap instead of being subtracted from it.
                # A session without a total time ends when it starts.
                time_deltas = [
                    first_visit_date - (
                        session.sessionStart
                        + datetime.timedelta(seconds=session.totalTime)
                    )
                    if session.totalTime is not None
                    else first_visit_date - session.sessionStart
                    for session in stored_sessions
                ]
                # Find all sessions that have the timedelta of less than 30 minutes on
                # the same day. A negative timedelta has a negative 'days',
                # so it is never selected.
                sessions_to_update = [
                    session
                    for session, delta in zip(stored_sessions, time_deltas)
                    if delta.days == 0 and 0 < delta.seconds < 60 * 30
                ]
                # Update the visitor's session when only 1 session is found to be
                # within the timedelta.
                if len(sessions_to_update) == 1:
                    _updateSession(sessions_to_update[0],
                                   visitor_dict['visits'], dynamo_client)
                elif len(sessions_to_update) > 1:
                    _updateSessions(sessions_to_update, visitor_dict['visits'],
                                    dynamo_client)
                # Create a new session when the time between the last session and the
                # first of these visits is greater than 30 minutes.
                else:
                    _addSessionToVisitor(ip, visitor_dict['visits'],
                                         visitor_dict['browsers'],
                                         dynamo_client)
    except Exception as e:
        print(f'ERROR processParquet { e }')
        print(
          f'Error getting object { key } from bucket { s3_client.bucketname }.' + \
            ' Make sure they exist and your bucket is in the same region as ' + \
            'this function.'
        )
        # Re-raise the exception unchanged so the caller sees the failure.
        raise
示例#27
0
    def addNewVisitor(self, visitor, location, browsers, visits):
        '''Adds a new visitor and their details to the table.

    The session is built from ``visits``: the first visit supplies the session
    start and IP address. The steps run in order (add the visitor, the
    location, the browsers, the session, and the visits) and stop at the first
    step that reports an error.

    Parameters
    ----------
    visitor : Visitor
      The visitor to be added to the table.
    location : Location
      The visitor's location to be added to the table.
    browsers : list[ Browser ]
      The visitor's browsers to be added to the table.
    visits : list[ Visit ]
      The visits to be added to the table. Must not be empty.

    Returns
    -------
    result : dict
      The result of adding the visitor and their attributes to the table. This
      is either the error that occurs or the visitor, location, browsers,
      visits, and session added.
    '''
        # The session is derived from the first and last visit. Check this
        # before any write so that an empty list does not leave a visitor in
        # the table without a session.
        if not visits:
            return {'error': 'Must pass at least one visit'}
        result = self.addVisitor(visitor)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addLocation(location)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addBrowsers(browsers)
        if 'error' in result:
            return {'error': result['error']}
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        ]
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
        else:
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
        else:
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
        result = self.addSession(session)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addVisits(visits)
        if 'error' in result:
            return {'error': result['error']}
        return {
            'visitor': visitor,
            'location': location,
            'browsers': browsers,
            'visits': visits,
            'session': session
        }
示例#28
0
def session():
  '''Builds a Session from the module-level test values.'''
  built = Session( session_start, visitor_id, avg_time, total_time )
  return built
示例#29
0
def session():
    '''Builds a Session that starts on 2020-01-01 for IP 0.0.0.0.'''
    built = Session('2020-01-01T00:00:00.000Z', '0.0.0.0', 60.0, 60.0)
    return built