Пример #1
0
 def end_query_session(self, end_time):
     """Close this query's session (first call only) and tally hovered results.

     While ``self.session_end_time`` is still None, the session end is
     recorded, ``self.session_time`` is computed with ``get_time_diff`` from
     ``self.session_start_time``, and ``self.update_times`` is called.

     Every call then classifies the first ``self.hover_depth`` results of
     ``self.query_response`` with ``is_relevant`` and adds them to
     ``self.hover_trec_rel_count`` / ``self.hover_trec_nonrel_count``.

     :param end_time: timestamp of the event that triggered the end of the
         session. Its value is not used: the end time is rolled back to
         ``self.last_interaction_time`` (see the comment below).
     """
     # DMAX added this condition to take the first end event only.
     # Subsequent events add overhead to the time - that isn't strictly part of the session.
     if self.session_end_time is None:
         # Added by David on December 13, 2016
         # Last events (e.g. EXPERIMENT_TIMEOUT) should not be considered as the final session event.
         # Some people in the experiment walk away (or something) for several minutes, meaning that times are way out.
         # In this case, we roll back to the last interaction event - where the event is not
         # EXPERIMENT_TIMEOUT or SESSION_COMPLETED.
         end_time = self.last_interaction_time
         self.session_end_time = end_time
         self.session_time = get_time_diff(self.session_start_time, end_time)
         self.update_times(end_time)

     # NOTE(review): this tally runs on every call, not only the first one, so
     # repeated calls add to the hover counters again - confirm that is intended.
     if self.hover_depth > 0:
         results = self.query_response.results
         # hover_depth may be larger than the number of results held for this
         # query; clamp the range so indexing cannot run past the end.
         for position in range(min(self.hover_depth, len(results))):
             docid_at_rank = results[position].docid

             if is_relevant(self.qrel_handler, self.topic, docid_at_rank) == 0:
                 self.hover_trec_nonrel_count = self.hover_trec_nonrel_count + 1
             else:
                 self.hover_trec_rel_count = self.hover_trec_rel_count + 1
Пример #2
0
    def process(self, vals):
        """Consume one parsed log line and update the session-level query state.

        ``vals`` is the list of fields of one log line: ``vals[0]`` and
        ``vals[1]`` are joined into the event timestamp and ``vals[8]`` is the
        event name. A QUERY_ISSUED line closes the current query (if any) and
        starts a new ``QueryLogEntry``; any other line is forwarded to the
        current query. Session-ending events close the current query once, and
        finally documents that were un-marked are removed from the marked
        lists of all queries.
        """
        def stamp():
            # "<date> <time>" built from the first two fields of the line.
            return '{date} {time}'.format(date=vals[0],time=vals[1])

        self.event_count += 1

        # Query time is measured from the most recent QUERY_FOCUS event, so
        # every new QUERY_FOCUS overrides the stored time. VIEW_SEARCH_BOX is
        # only used when no start time is stored yet.
        if 'QUERY_FOCUS' in vals:
            self.last_query_focus_time = stamp()
        if self.last_query_focus_time is None and 'VIEW_SEARCH_BOX' in vals:
            self.last_query_focus_time = stamp()

        if 'QUERY_ISSUED' in vals:
            # A new query was issued: close the previous one first.
            if self.current_query:
                # No FOCUS or VIEW_SEARCH_BOX seen -> fall back to the last event time.
                self.current_query.end_query_session(
                    self.last_query_focus_time or self.last_event_time)

            if self.last_query_focus_time is None:
                self.last_query_focus_time = self.last_event_time

            self.current_query = QueryLogEntry(
                self.key,
                vals,
                self.qrel_handler,
                self.engine,
                get_time_diff(self.last_query_focus_time, stamp()),
            )
            self.last_query_focus_time = None
            self.query_ended_previously = False
            self.queries.append(self.current_query)
        elif self.current_query:
            # Everything else belongs to the query currently in progress.
            self.current_query.process(vals)

        # probably should put a condition on this (start task, doc viewed, view serp, etc, ) not all/any
        self.last_event_time = stamp()

        session_closers = (
            'PRACTICE_SEARCH_TASK_COMPLETED',
            'SESSION_COMPLETED',
            'EXPERIMENT_TIMEOUT',
            'SNIPPET_POSTTASK_SURVEY_STARTED',
            'SEARCH_TASK_COMPLETED',
        )
        if vals[8] in session_closers and self.current_query and not self.query_ended_previously:
            self.current_query.end_query_session(stamp())
            self.query_ended_previously = True

        # Remove documents that were previously marked, but were then
        # reselected as non-relevant. First drain every query's pending list...
        retracted = []
        for entry in self.queries:
            retracted.extend(entry.doc_unmarked_list)
            entry.doc_unmarked_list = []

        # ...then undo the marking in whichever query holds the document.
        # NOTE(review): the counters decremented below are the
        # doc_clicked_trec_* ones - confirm these are the counters that marking
        # a document relevant incremented.
        for entry in self.queries:
            for docid in retracted:
                if docid not in entry.doc_marked_list:
                    continue

                topic = self.key.split(' ')[4]
                entry.doc_marked_list.remove(docid)
                entry.doc_rel_count -= 1

                if is_relevant(self.qrel_handler, topic, docid) == 0:
                    entry.doc_clicked_trec_nonrel_count -= 1
                else:
                    entry.doc_clicked_trec_rel_count -= 1
Пример #3
0
    def process(self, vals):
        """Update this query's counters from one parsed log line.

        ``vals`` is the list of fields of one log line: ``vals[0]`` and
        ``vals[1]`` form the timestamp and ``vals[8]`` is the event name; the
        remaining indices read below depend on the event type. The timestamp
        is first handed to ``self.update_times``; afterwards the page, view,
        hover and marking counters are updated and the last-event bookkeeping
        is refreshed.
        """
        self.event_count += 1
        event = vals[8]
        self.curr_event = event
        timestamp = '{date} {time}'.format(date=vals[0],time=vals[1])
        self.update_times(timestamp)

        if 'VIEW_SEARCH_RESULTS_PAGE' in vals:
            # The page number is only present on 10-field lines; default to page 1.
            page = int(vals[9]) if len(vals) == 10 else 1
            self.pages = max(self.pages, page)
            self.curr_page = page

        if 'DOC_MARKED_VIEWED' in vals:
            self.doc_depth = max(self.doc_depth, int(vals[13]))
            self.doc_count += 1

            if is_relevant(self.qrel_handler, vals[7], vals[10]) == 0:
                self.doc_clicked_trec_nonrel_count += 1
            else:
                self.doc_clicked_trec_rel_count += 1

        if 'DOCUMENT_HOVER_IN' in vals:
            depth = int(vals[-1])
            self.hover_count += 1
            self.hover_depth = max(self.hover_depth, depth)

        if 'DOC_MARKED_RELEVANT' in vals and int(vals[12]) > 0:
            self.doc_rel_count += 1
            self.doc_marked_list.append(vals[10])

            # Check whether the marked document is also TREC relevant.
            if is_relevant(self.qrel_handler, vals[7], vals[10]) == 0:
                self.doc_trec_nonrel_count += 1
            else:
                self.doc_trec_rel_count += 1

            self.doc_rel_depth = max(self.doc_rel_depth, int(vals[13]))

        if 'DOC_MARKED_NONRELEVANT' in vals:
            # Only queued here; this method does nothing further with the list.
            self.doc_unmarked_list.append(vals[10])

        self.last_last_event = self.last_event
        self.last_event = event
        self.last_time = timestamp

        # SESSION_COMPLETED / EXPERIMENT_TIMEOUT do not count as interactions.
        if event not in ('SESSION_COMPLETED', 'EXPERIMENT_TIMEOUT'):
            self.last_interaction_event = event
            self.last_interaction_time = timestamp
Пример #4
0
def walk_trees(path: PurePath, cold_index: Index, hot_dir: Path,
               cold_dir: Path, hot_rules: List[PathAwareGitWildMatchPattern],
               cold_rules: List[PathAwareGitWildMatchPattern],
               pbar: tqdm) -> List[Change]:
    """Diff the hot tree, the cold tree and the cold index below ``path``.

    When ``path`` is a file on both sides, a single change (or none) is
    returned for it. When it is a directory on both sides, its children are
    compared: children present on one side only, or only in the index, yield
    one change each without descending further, while children present on
    both sides are walked recursively.

    ``hot_rules`` and ``cold_rules`` are extended in place with the patterns
    of a ``.gitignore`` found in the current directory and, at the root
    (``path == PurePath()``), with an ``index.txt`` pattern. Recursive calls
    receive copies of the lists.

    :param path: current path, relative to both base directories
    :param cold_index: the cold index; only read here
    :param hot_dir: hot base path
    :param cold_dir: cold base path
    :param hot_rules: ignore patterns applied to the hot dir (mutated)
    :param cold_rules: ignore patterns applied to the cold dir (mutated)
    :param pbar: progress bar handed to the hashing helpers; also advanced by
        the size of files under cold-only children missing from the index
    :return: the changes found under ``path``
    :raises NotImplementedError: if ``path`` is not a file on both sides and
        not a directory on both sides, or is a directory on both sides while
        its index entry is a ``str``
    """
    sub_index = cold_index[path] if path in cold_index else None
    hot_path = hot_dir / path
    cold_path = cold_dir / path

    # --- file on both sides: at most one change -----------------------------
    if hot_path.is_file() and cold_path.is_file():
        hot_hash, cold_hash, same_content = hash_compare_files(
            hot_path, cold_path, pbar)

        if path not in cold_index:
            if same_content:
                return [AddedCopied(path, 0, hot_hash)]
            return [AddedAppeared(path, hot_path.stat().st_size, hot_hash)]

        indexed_hash = cold_index[path]
        index_differs_from_cold = indexed_hash != cold_hash

        if same_content:
            if index_differs_from_cold:
                return [ModifiedCopied(path, 0, hot_hash)]
            return []

        hot_size = hot_path.stat().st_size
        if not index_differs_from_cold:
            return [Modified(path, hot_size, hot_hash)]
        if hot_hash == indexed_hash:
            return [Corrupted(path, hot_size, hot_hash)]
        return [ModifiedCorrupted(path, hot_size, hot_hash)]

    # --- anything other than "directory on both sides" is unsupported -------
    if not (hot_path.is_dir() and cold_path.is_dir()) or isinstance(
            sub_index, str):
        raise NotImplementedError("File/Folder name collision")

    # --- directory on both sides ----------------------------------------------
    changes: List[Change] = []
    sides = ((hot_path, hot_rules), (cold_path, cold_rules))

    # Pick up ignore patterns declared in this directory (hot first, then cold).
    for base, rules in sides:
        ignore_file = base / '.gitignore'
        if ignore_file.exists():
            with open(ignore_file, 'r') as handle:
                rules.extend(
                    PathAwareGitWildMatchPattern(line, base)
                    for line in handle.read().splitlines())

    # At the root, an 'index.txt' pattern is added to both rule lists.
    if path == PurePath():
        for base, rules in sides:
            rules.append(PathAwareGitWildMatchPattern('index.txt', base))

    hot_children: Set[PurePath] = {
        entry.relative_to(hot_dir)
        for entry in hot_path.iterdir()
        if is_relevant(entry, hot_rules)
    }
    cold_children: Set[PurePath] = {
        entry.relative_to(cold_dir)
        for entry in cold_path.iterdir()
        if is_relevant(entry, cold_rules)
    }
    index_children: Set[PurePath] = {
        path / name
        for name in (sub_index.iterdir() if sub_index is not None else [])
    }

    # H C I: 1 0 X -- only in hot
    for hot_child in hot_children - cold_children:
        tree_hash, size = hash_tree(hot_dir / hot_child, pbar)
        if hot_child not in cold_index:
            change = Added(hot_child, size, tree_hash)
        elif tree_hash == cold_index[hot_child]:
            change = Lost(hot_child, size)
        else:
            change = ModifiedLost(hot_child, size, tree_hash)
        changes.append(change)

    # H C I: 0 1 X -- only in cold
    for cold_child in cold_children - hot_children:
        if cold_child not in cold_index:
            changes.append(Appeared(cold_child, 0))
            # Not hashed, so advance the progress bar by file size instead.
            for entry in walk(cold_dir / cold_child, cold_rules):
                pbar.update(entry.stat().st_size)
            continue

        tree_hash, _ = hash_tree(cold_dir / cold_child, pbar)
        if tree_hash == cold_index[cold_child]:
            changes.append(Removed(cold_child, 0))
        else:
            changes.append(RemovedCorrupted(cold_child, 0))

    # H C I: 0 0 1 -- only in the index
    changes.extend(
        RemovedLost(index_child, 0)
        for index_child in index_children - hot_children - cold_children)

    # Recursive: (H C I: 1 1 X) -- present on both sides; children get their
    # own copies of the rule lists.
    for child in hot_children & cold_children:
        changes.extend(
            walk_trees(child, cold_index, hot_dir, cold_dir,
                       list(hot_rules), list(cold_rules), pbar))

    return changes