Пример #1
0
    def getTop(num=100, highlight_funcs=None, remove_funcs=None):
        """
            Get the top ``num`` filtered URL domains for a case, with the netlocs
            grouped under each domain. Optimised as one query with a subquery.

            :param num: number of domains to return, or the string ``"all"``
                for no limit.
            :param highlight_funcs: filter functions passed to `getFilter`.
            :param remove_funcs: filter functions passed to `getFilter`.
            :returns: a list of ``(domain, domain_count, [(netloc, netloc_count), ...])``
                tuples ordered by descending domain count.
        """
        # Use None sentinels instead of mutable default arguments, which are
        # shared between calls.
        if highlight_funcs is None:
            highlight_funcs = []
        if remove_funcs is None:
            remove_funcs = []

        # Renamed from `filter` to avoid shadowing the builtin.
        entry_filter = getFilter(highlight_funcs, remove_funcs)

        url2 = aliased(URL)
        # Subquery: the most frequently occurring domains (optionally limited).
        subq = session.query(url2.domain, func.count(url2.domain).label('domain_count'))\
                .join(entry_filter).filter(url2.domain != None)\
                .group_by(url2.domain).order_by(desc(func.count(url2.domain)))

        if num != "all":
            subq = subq.limit(num)
        subq = subq.subquery()

        # Outer query: netloc counts joined back onto the top-domain subquery.
        q = session.query(URL.netloc, func.count(1), subq.c.domain, subq.c.domain_count)\
            .join((subq, URL.domain == subq.c.domain))\
            .join(entry_filter)\
            .group_by(URL.netloc)\
            .order_by(desc(subq.c.domain_count), asc(subq.c.domain), asc(URL.netloc))

        domains = []
        for netloc, netloc_count, domain, domain_count in q:
            # Rows arrive ordered by domain, so start a new bucket on change.
            if not domains or domains[-1][0] != domain:
                domains.append((domain, domain_count, []))

            domains[-1][2].append((netloc, netloc_count))

        return domains
Пример #2
0
 def getAll(cls):
     """
         Return a query over every object of this class, ordered ascending by
         its key column: ``id`` when the class has one, ``entry_id`` otherwise.
     """
     order_column = getattr(cls, 'id', None)
     if order_column is None:
         order_column = cls.entry_id
     return session.query(cls).order_by(asc(order_column))
Пример #3
0
    def filesAccessed():
        """
            Returns a file directory tree and the total amount of files accessed. The tree
            is a dictionary of drives, each drive letter is the key and the value is another dictionary 
            with the keys being file types. For each file type, the value is a tree structure with the
            end point being a file name, the number of times the file was accessed and a list of 
            access dates.
                    
            .. note::
                This does not allow for Linux etc drives, only Windows one letter drives.
                
                **ToDo**: re-optimise this query, more things were done making it slow again for 
                larger datasets. 
            
                **ToDo**: Allow other operating systems file systems. 
        """
        q = session.query(URL, func.count(1)).filter(URL.scheme == "file").order_by(func.count(1))
        q = q.group_by(URL.path)

        total = q.count()

        drives = {}

        for url_row, count in q:
            # Strip the leading "/" of the file URL path and decode %-escapes.
            path = urllib.unquote(url_row.path[1:])

            # hack to get dates working: needs to be made into above query ^
            dates = session.query(Entry.access_date, Entry.access_time).join(URL)\
                .filter(URL.path == url_row.path)\
                .order_by(desc(Entry.access_date), desc(Entry.access_time))

            # Guard the length: a path shorter than "X:" used to raise
            # IndexError on path[1].
            if len(path) > 1 and path[1] == ":":
                drive = path[0]
            else:
                continue  # don't deal with Linux yet

            ending = path.rsplit('.', 1)
            # Renamed from `type` to avoid shadowing the builtin.
            file_type = FILE_TYPES.get(ending[-1].strip().lower(), 'other file')

            if drive not in drives:
                drives[drive] = ({}, 0)

            if file_type not in drives[drive][0]:
                drives[drive][0][file_type] = ({}, 0)

            # Tuples are immutable, so rebuild them with the updated counts.
            drives[drive] = (drives[drive][0], drives[drive][1] + count)
            drives[drive][0][file_type] = (drives[drive][0][file_type][0],
                                           drives[drive][0][file_type][1] + count)

            Entry._buildTree(drives[drive][0][file_type][0], (path, count, dates))

        return drives, total
Пример #4
0
    def peakTime():
        """
            Return the peak time of web browser usage as an ``'HH:MM - HH:MM'``
            string. Similar to the heatmap, but sums each hour across all days
            of the week and returns the busiest one-hour slot (``None`` when
            there are no entries at all).
        """
        step = 1

        best_count = 0
        best_label = None

        # Walk the day in `step`-hour windows using a reference date of
        # 1/1/0001 purely for time arithmetic.
        for hour in range(0, 24, step):
            window_start = datetime(1, 1, 1, hour)
            lower = window_start.time()
            # End of window: 1 millisecond before the next window begins.
            upper = (window_start + timedelta(hours=step) - timedelta(milliseconds=1)).time()

            count = session.query(Entry).filter(Entry.access_time >= lower)\
                    .filter(Entry.access_time <= upper).count()

            if count > best_count:
                best_count = count
                best_label = lower.strftime('%H:%M - ') + upper.strftime('%H:%M')

        return best_label
Пример #5
0
 def getAllEntries(case):
     """
         Return a query for every entry of ``case`` whose access time and
         access date are both set.
     """
     entries = session.query(Entry)\
         .filter(Entry.access_time != None)\
         .filter(Entry.access_date != None)\
         .join('group')\
         .filter_by(case_id=case.id)
     return entries
Пример #6
0
 def setDomain_manual(group):
     """
         Set the domain name on every URL belonging to ``group``. Needed by the
         optimised URL-adding path, which bypasses `URL.__init__()` and so
         skips the automatic domain assignment.
     """
     urls = session.query(URL).join('entry').filter_by(group_id=group.id).all()
     for url in urls:
         url.setDomain()
Пример #7
0
 def getEndDate(self):
     """
         Get the ending (most recent) date of all the group's entries.

         :returns: the formatted access date of the latest fully-dated entry,
             or ``None`` when the group has no entries with both a date and
             a time.
     """
     latest = session.query(Entry).filter_by(group_id=self.id).\
         filter(Entry.access_date != None).filter(Entry.access_time != None).\
         order_by(desc(Entry.access_date), desc(Entry.access_time)).\
         first()
     # .first() returns None on an empty result set; previously this raised
     # AttributeError when accessing .access_date.
     if latest is None:
         return None
     return self._formatDate(latest.access_date)
Пример #8
0
    def onlineSearches(engine, highlight_funcs=None, remove_funcs=None, num=20):
        """
            Given a search engine, amount and filters, returns a word cloud dictionary of
            search terms. The keys are the terms and the values are a (ratio, search phrases
            term was in, length of search phrases) tuple. Also returns total amount of terms,
            total number of unique terms and the smallest ratio (used to make the sizes of
            the words relative to the smallest).

            :param engine: search engine name the terms must belong to.
            :param highlight_funcs: filter functions passed to `getFilter`.
            :param remove_funcs: filter functions passed to `getFilter`.
            :param num: maximum number of terms, or the string ``"all"`` for no limit.
        """
        # Use None sentinels instead of mutable default arguments, which are
        # shared between calls.
        if highlight_funcs is None:
            highlight_funcs = []
        if remove_funcs is None:
            remove_funcs = []

        # Renamed from `filter` to avoid shadowing the builtin.
        entry_filter = getFilter(highlight_funcs, remove_funcs)

        searchterms2 = aliased(SearchTerms)
        # Subquery: ids of the most frequently occurring terms for this engine.
        subq = session.query(searchterms2.id).filter(searchterms2.engine==engine)\
            .join(searchterms2.entries).join((entry_filter, Entry.id == entry_filter.c.id))\
            .group_by(searchterms2.id)\
            .order_by(desc(searchterms2.occurrence))

        if num != "all":
            subq = subq.limit(num)

        subq = subq.subquery()

        # Eager-load the entries and their parsed URLs so the loop below does
        # not issue one query per term.
        q = session.query(SearchTerms)\
            .join((subq, subq.c.id == SearchTerms.id))\
            .join(SearchTerms.entries)\
            .join((entry_filter, Entry.id == entry_filter.c.id))\
            .join(Entry.parsedurl).filter(SearchTerms.engine==engine)\
            .options(contains_eager('entries'), contains_eager('entries.parsedurl'))\
            .order_by(desc(Entry.access_date), desc(Entry.access_time))

        terms = q.all()
        total_terms = float(sum(len(t.entries) for t in terms))
        unique_terms = len(terms)
        cloud = {}
        smallest = 1.0
        for term in terms:
            search_strings = [(entry.parsedurl.search, entry.timeline_date) for entry in term.entries]
            ratio = len(search_strings)/total_terms
            # Hack to stop javascript name-space clash. TODO: Fix this in nicer way.
            cloud['_' + term.term] = (ratio, search_strings, len(search_strings))
            if ratio < smallest:
                smallest = ratio
        return cloud, total_terms, unique_terms, smallest
Пример #9
0
 def getPercentages():
     """
         Used in the overview statistics. Return a list of
         ``(browser name, percent)`` tuples, where percent is the fraction of
         all entries that browser accounts for.
     """
     rows = session.query(Browser.name, func.count(1)).join('entries').group_by(Browser.name).all()
     total = float(sum(count for _, count in rows))
     return [(name, count / total) for name, count in rows]
Пример #10
0
 def averagePages():
     """
         Returns the average number of websites visited in one day, counting
         only http and https entries. Returns 0 when there are no such entries.
     """
     q = session.query(Entry.access_date, func.count(1)).join('parsedurl')\
         .filter(or_(URL.scheme=="http", URL.scheme=="https")).group_by(Entry.access_date)
     # Fetch the rows once: their length is the day count, which saves the
     # extra COUNT query that q.count() used to issue.
     daily_counts = [count for _, count in q]
     if daily_counts:
         return int(sum(daily_counts) / float(len(daily_counts)))
     return 0
Пример #11
0
    def getTimeGraph(startdate, enddate, starttime, endtime, remove_funcs, highlight_funcs,
               remove_duplicates=False, duplicate_time=0):
        """
            Given a start and end date and time and filters, returns three lists of entry points:
            `highlighted`, `not_highlighted` and `removed` by calling `filter_queries()`
            Each entry will be in one of those lists for Flot to draw.

            :param startdate: earliest access date included (inclusive).
            :param enddate: latest access date included (inclusive).
            :param starttime: NOTE(review): not applied in the visible code —
                only the dates are filtered here; confirm whether the times
                are filtered downstream.
            :param endtime: see ``starttime``.
            :param remove_funcs: filters selecting entries to remove.
            :param highlight_funcs: filters selecting entries to highlight.
            :param remove_duplicates: when True, suppress repeated points that
                are closer together than ``duplicate_time`` minutes.
            :param duplicate_time: minimum gap, in minutes, between kept
                duplicates.
        """
        
        # make the queries
        # ----------------
        q = session.query(Entry.access_date, Entry.access_time, Entry.url, Browser.name,
                          Browser.version, Browser.source, Group.program, Entry.title)
        
        # join onto all the other tables
        q = q.join('browser').join('parsedurl').join('group').outerjoin(Entry.search_terms)
        
        # filter by the start and end dates and times
        q = q.filter(Entry.access_date >= startdate).filter(Entry.access_date <= enddate)
        
        q_removed, q_highlighted, q_not_highlighted = Case.filter_queries(q, remove_funcs, highlight_funcs)

        # Pair each ordered query with the (initially empty) list that will
        # collect its plottable points.
        removed = (q_removed.order_by(asc(Entry.access_date), asc(Entry.access_time)), [])
        highlighted = (q_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), []) 
        not_highlighted = (q_not_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), [])

        # put the results in the format Flot wants it
        # -------------------------------------------
        for (q, entries) in [removed, highlighted, not_highlighted]:
            # `seen` holds every plot tuple already emitted for this list, so
            # exact repeats are dropped when remove_duplicates is on.
            seen = set()
            for cols in q:   
                plot = get_plotable(*cols)

                if remove_duplicates == True: # remove duplicates                
                    if plot not in seen:
                        if len(entries) == 0:
                            entries.append(plot)   # add first entry regardless
                            seen.add(plot)
                        else:
                            # Keep the point only if it is on a later x value,
                            # or on the same x but at least duplicate_time
                            # minutes after the previous kept point.
                            # NOTE(review): assumes plot[0] is the date and
                            # plot[1] the time expressed in hours (minutes are
                            # converted via /60) — confirm against get_plotable.
                            if (entries[-1][0] == plot[0] and \
                            (plot[1] - entries[-1][1]) >= float(duplicate_time)/float(60)) \
                            or (entries[-1][0] < plot[0]): 
                                # only add entries <duplicate_time> minutes apart
                                entries.append(plot)
                                seen.add(plot)
                else: # else add everything
                    entries.append(plot)
                
        return highlighted[1], not_highlighted[1], removed[1]
Пример #12
0
 def testfilter_queries(self):
     """
         filter_queries() must split the base query into the expected
         removed / highlighted / remaining row counts for 1 May 2010.
     """
     target_date = datetime(2010, 5, 1)
     base_query = session.query(Entry.url).join('browser').join('parsedurl').\
         join('group').outerjoin(Entry.search_terms).\
         filter(Entry.access_date==target_date)

     remove_filter = FilterQuery()
     remove_filter.add_element('URL Parts','domain','Is','bbc.co.uk', None)
     highlight_filter = FilterQuery()
     highlight_filter.add_element('URL Parts','domain','Is','google.com',None)

     q_removed, q_highlighted, q_rest = Case.filter_queries(
         base_query, [remove_filter], [highlight_filter])

     self.assertEqual(q_removed.count(), 4)
     self.assertEqual(q_highlighted.count(), 10)
     self.assertEqual(q_rest.count(), 15)
Пример #13
0
def getFilter(highlight_funcs=None, remove_funcs=None):
    """
        Get the right filter. This is used to filter domain names and search
        terms correctly. `Case.filter_queries` is called; if `highlight_funcs`
        is non-empty the highlighted query is returned, otherwise the
        non-highlighted query.

        :param highlight_funcs: filter functions selecting entries to highlight.
        :param remove_funcs: filter functions selecting entries to remove.
        :returns: a subquery of distinct matching `Entry` ids.
    """
    # Use None sentinels instead of mutable default arguments, which are
    # shared between calls.
    if highlight_funcs is None:
        highlight_funcs = []
    if remove_funcs is None:
        remove_funcs = []

    filter_q = session.query(Entry.id)\
                .join(Entry.parsedurl, Entry.browser, Entry.group)\
                .outerjoin(Entry.search_terms)

    q_removed, q_highlighted, q_not_highlighted = \
        Case.filter_queries(filter_q, remove_funcs, highlight_funcs)

    if highlight_funcs:
        filter_q = q_highlighted
    else:
        filter_q = q_not_highlighted

    return filter_q.group_by(Entry.id).subquery()
Пример #14
0
    def generateHeatMap():
        """
            Returns heatmap data for the overview: the table headers, the
            heatmap table rows and the highest and lowest cell values (used to
            calculate the heatmap colour scale).
        """
        headers = ['Mon', 'Tue','Wed','Thu','Fri','Sat','Sun']
        rows = []
        step = 1 # number of hours each row in table represents

        init = datetime(1, 1, 1)
        start_time = init.time()
        until_time = None

        highest = 0
        # Bug fix: `lowest` was initialised to 0, but counts are never
        # negative, so `q < lowest` could never fire and the minimum was
        # always reported as 0. Track it with a None sentinel instead.
        lowest = None

        while(start_time < t((24-step),0,0)):
            start_time = init.time()
            # End of window: 1 millisecond before the next window begins.
            until_time = (init + timedelta(hours=step) - timedelta(milliseconds=1)).time()

            title = start_time.strftime('%H:%M - ') + until_time.strftime('%H:%M')

            values = []
            for weekday in [1,2,3,4,5,6,0]: # '%w' gives weekdays starting on sunday
                q = session.query(Entry)\
                    .filter(func.strftime('%w', Entry.access_date) == str(weekday))\
                    .filter(Entry.access_time >= start_time)\
                    .filter(Entry.access_time <= until_time).count()

                if q > highest:
                    highest = q
                if lowest is None or q < lowest:
                    lowest = q

                values.append(q)

            rows.append((title, values))
            init = init + timedelta(hours=step)

        if lowest is None:
            lowest = 0
        return headers, rows, highest, lowest
Пример #15
0
 def getFilterBy(cls, **conditions):
     """
         Return a query for the objects satisfying the given keyword
         conditions, e.g. ``Case.getFilterBy(name="Foo")``.
     """
     # kwargs renamed from **vars: the name is invisible to callers and
     # no longer shadows the builtin.
     return session.query(cls).filter_by(**conditions)
Пример #16
0
 def getAmount(cls):
     """
         Count how many objects of this class exist.
     """
     amount = session.query(cls).count()
     return amount
Пример #17
0
 def get_case():
     """
         Fetch the current case (the first one in the database).
     """
     current = session.query(Case).first()
     return current
Пример #18
0
 def get(cls, id):
     """
         Look up and return the object with the given primary-key id.
     """
     obj = session.query(cls).get(id)
     return obj
Пример #19
0
 def getNumEntries(self):
     """
         Count the entries belonging to this group.
     """
     group_entries = session.query(Entry).filter_by(group_id=self.id)
     return group_entries.count()