def end_query_session(self, end_time):
    """Finalise this query's session and compute hover-based relevance tallies.

    Only the first call has any effect (DMAX: take the first end event only;
    subsequent events add overhead that isn't strictly part of the session).

    :param end_time: candidate session end timestamp string; overridden below
        by the last interaction time.
    """
    if self.session_end_time is None:
        # Added by David on December 13, 2016.
        # Last events (e.g. EXPERIMENT_TIMEOUT) should not be considered as the
        # final session event.  Some people in the experiment walk away (or
        # something) for several minutes, meaning that times are way out.
        # Roll back to the last interaction event - where the event is not
        # EXPERIMENT_TIMEOUT or SESSION_COMPLETED.
        end_time = self.last_interaction_time
        self.session_end_time = end_time
        # print "END EVENT: {0}".format(self.last_interaction_event)
        # print "END TIME: {0}".format(self.last_interaction_time)
        self.session_time = get_time_diff(self.session_start_time, end_time)
        #print "session time", self.session_time
        self.update_times(end_time)
        #if self.last_event == 'VIEW_SEARCH_RESULTS_PAGE':
        #    self.snippet_time = self.snippet_time + get_time_diff(self.view_serp_time, end_time)

        # Adding some code to work out probabilities for clicking!
        num_results = len(self.query_response.results)

        # NOTE(review): relevant_count is computed but never stored or
        # returned; kept for parity with the original code.
        relevant_count = 0
        # Preserve the original guard semantics: if the hover depth exceeds
        # the result list, nothing is counted at all.
        if self.hover_depth <= num_results:
            for i in range(0, self.hover_depth):
                if self.qrel_handler.get_value(self.topic, self.query_response.results[i].docid) > 0:
                    relevant_count = relevant_count + 1

        # BUG FIX: the original second loop iterated to self.hover_depth with
        # no bounds check and raised IndexError whenever hover_depth exceeded
        # the number of results (the first loop guards against exactly this).
        # Clamp the iteration to the available results.
        for i in range(0, min(self.hover_depth, num_results)):
            docid_at_rank = self.query_response.results[i].docid
            if is_relevant(self.qrel_handler, self.topic, docid_at_rank) == 0:
                self.hover_trec_nonrel_count = self.hover_trec_nonrel_count + 1
            else:
                self.hover_trec_rel_count = self.hover_trec_rel_count + 1
def process(self, vals):
    """Consume one parsed (whitespace-split) log line for this session.

    Field layout as used below: vals[0]=date, vals[1]=time, vals[8]=event
    name; the remaining fields are event-specific.  Maintains the current
    QueryLogEntry: a new one is started on QUERY_ISSUED, other events are
    delegated to it, and terminal events close its session.
    """
    self.event_count = self.event_count + 1
    #self.last_event_time
    # We want to measure query time from the last QUERY_FOCUS event.
    # We could do it from the first, but we decided this could be too unreliable...
    # So every time we see a new QUERY_FOCUS, we override what we have before and update the time accordingly.
    # Commented out this line so that this is overwritten
    #if self.last_query_focus_time is None:
    if ('QUERY_FOCUS' in vals):
        self.last_query_focus_time = '{date} {time}'.format(date=vals[0],time=vals[1])
    if self.last_query_focus_time is None:
        # No QUERY_FOCUS seen yet; fall back to VIEW_SEARCH_BOX as the marker
        # for when the user started composing the query.
        if ('VIEW_SEARCH_BOX' in vals):
            self.last_query_focus_time = '{date} {time}'.format(date=vals[0],time=vals[1])
    # End de-dentation
    if ('QUERY_ISSUED' in vals):
        # new query, create a query log entry
        if self.current_query:
            # Close the previous query's session at the moment the user began
            # composing this new query, if that moment is known.
            if self.last_query_focus_time:
                lqft = self.last_query_focus_time
            else:
                lqft = self.last_event_time  # We didn't see a FOCUS or VIEW_SEARCH_BOX, so fallback to last event time.
            self.current_query.end_query_session(lqft)
        #print "QUERY ISSUED:", vals[8:]
        #print self.last_query_focus_time, ':::', vals[1], ':::', get_time_diff(self.last_query_focus_time, vals[1])
        #print
        if self.last_query_focus_time is None:
            self.last_query_focus_time = self.last_event_time
        # The fifth constructor argument is the elapsed time between query
        # focus and query issue (time spent composing the query).
        self.current_query = QueryLogEntry(self.key, vals, self.qrel_handler, self.engine, get_time_diff(self.last_query_focus_time, '{date} {time}'.format(date=vals[0],time=vals[1])))
        self.last_query_focus_time = None
        self.query_ended_previously = False
        self.queries.append(self.current_query)
    else:
        if self.current_query:
            # process result under this query object
            self.current_query.process(vals)
    # probably should put a condition on this (start task, doc viewed, view serp, etc, ) not all/any
    self.last_event_time = '{date} {time}'.format(date=vals[0],time=vals[1])
    event = vals[8]
    if event in ['PRACTICE_SEARCH_TASK_COMPLETED','SESSION_COMPLETED','EXPERIMENT_TIMEOUT','SNIPPET_POSTTASK_SURVEY_STARTED','SEARCH_TASK_COMPLETED']:
        #print 'search task complete - event'
        if self.current_query and not self.query_ended_previously:
            #print "end of search session"
            self.current_query.end_query_session('{date} {time}'.format(date=vals[0],time=vals[1]))
            self.query_ended_previously = True
            # Code for removing documents that were previously marked, but are then reselected as non-relevant.
            all_docs_unmarked = []
            for query_object in self.queries:
                all_docs_unmarked = all_docs_unmarked + query_object.doc_unmarked_list
                query_object.doc_unmarked_list = []
            for query_object in self.queries:
                for docid in all_docs_unmarked:
                    if docid in query_object.doc_marked_list:
                        # NOTE(review): assumes the fifth space-separated field
                        # of self.key is the topic id -- confirm against key format.
                        topic = self.key.split(' ')[4]
                        query_object.doc_marked_list.remove(docid)
                        query_object.doc_rel_count = query_object.doc_rel_count - 1
                        # Undo the TREC rel/nonrel click tally for the retracted mark.
                        if is_relevant(self.qrel_handler, topic, docid) == 0:
                            query_object.doc_clicked_trec_nonrel_count = query_object.doc_clicked_trec_nonrel_count - 1
                        else:
                            query_object.doc_clicked_trec_rel_count = query_object.doc_clicked_trec_rel_count - 1
def process(self, vals):
    """Update this query's depth/click/hover counters from one parsed log line.

    Field layout as used below: vals[0]=date, vals[1]=time, vals[8]=event
    name; vals[7] appears to be the topic id and vals[10] the document id
    for document events -- TODO confirm against the actual log format.
    """
    self.event_count = self.event_count + 1
    self.curr_event = vals[8]
    self.update_times('{date} {time}'.format(date=vals[0],time=vals[1]))
    if 'VIEW_SEARCH_RESULTS_PAGE' in vals:
        # Track the deepest SERP page viewed; an explicit page number is only
        # present when the line has 10 fields, otherwise page 1 is assumed.
        n = 1
        if len(vals) == 10:
            n = int(vals[9])
        if self.pages < n:
            self.pages = n
        self.curr_page = n
    if 'DOC_MARKED_VIEWED' in vals:
        m = int(vals[13])  # rank of the viewed document on the SERP
        #n = (self.curr_page - 1)* PAGE_SIZE + m
        if self.doc_depth < m:
            self.doc_depth = m
        self.doc_count = self.doc_count + 1
        # Tally clicks against TREC judgments (rel vs. non-rel).
        if is_relevant(self.qrel_handler, vals[7], vals[10]) == 0:
            self.doc_clicked_trec_nonrel_count = self.doc_clicked_trec_nonrel_count + 1
        else:
            self.doc_clicked_trec_rel_count = self.doc_clicked_trec_rel_count + 1
    if 'DOCUMENT_HOVER_IN' in vals:
        m = int(vals[-1])  # hovered rank is carried in the final field
        #print vals
        #print m
        #n = (self.curr_page - 1)* PAGE_SIZE + m
        self.hover_count += 1
        if m > self.hover_depth:
            self.hover_depth = m
    if 'DOC_MARKED_RELEVANT' in vals:
        r = int(vals[12])
        if r > 0:
            self.doc_rel_count = self.doc_rel_count + 1
            self.doc_marked_list.append(vals[10])
            # add in here a check to determine whether the document was trec relevant.
            if is_relevant(self.qrel_handler, vals[7], vals[10]) == 0:
                self.doc_trec_nonrel_count = self.doc_trec_nonrel_count + 1
            else:
                self.doc_trec_rel_count = self.doc_trec_rel_count + 1
            m = int(vals[13])
            if self.doc_rel_depth < m:
                self.doc_rel_depth = m
    if 'DOC_MARKED_NONRELEVANT' in vals:
        # Remember retractions so the session-level processor can later undo
        # the corresponding marks/tallies across all queries.
        self.doc_unmarked_list.append(vals[10])
    self.last_last_event = self.last_event
    self.last_event = vals[8]
    self.last_time = '{date} {time}'.format(date=vals[0], time=vals[1])
    if (vals[8] not in ['SESSION_COMPLETED', 'EXPERIMENT_TIMEOUT']):
        # Track the last *interaction* event, excluding terminal/timeout
        # events, so session end times can be rolled back to real activity.
        self.last_interaction_event = vals[8]
        self.last_interaction_time = '{date} {time}'.format(date=vals[0], time=vals[1])
def walk_trees(path: PurePath, cold_index: Index, hot_dir: Path, cold_dir: Path,
               hot_rules: List[PathAwareGitWildMatchPattern],
               cold_rules: List[PathAwareGitWildMatchPattern],
               pbar: tqdm) -> List[Change]:
    """Compare hot (sub)dir, cold (sub)dir, and cold index.

    Returns a list of changes required to get them synced.  It only returns
    outermost directories/files for each change (except content changes,
    which it lists all).

    :param path: current relative path
    :param cold_index: the original unchanged cold index
    :param hot_dir: hot base path
    :param cold_dir: cold base path
    :param hot_rules: files to ignore in hot dir
    :param cold_rules: files to ignore in cold dir
    :param pbar: will be updated as files get hashed
    :return: All changes between cold and hot directories under current sub path
    """
    sub_index = cold_index[path] if path in cold_index else None
    # Case 1: path is a plain file on both sides -- classify by hash comparison
    # against each other and against the recorded index entry.
    if (hot_dir / path).is_file() and (cold_dir / path).is_file():
        hot_hash, cold_hash, eq = hash_compare_files(hot_dir / path, cold_dir / path, pbar)
        if eq:
            if path not in cold_index:
                return [AddedCopied(path, 0, hot_hash)]
            elif cold_index[path] != cold_hash:
                return [ModifiedCopied(path, 0, hot_hash)]
            else:
                return []
        else:
            if path not in cold_index:
                return [
                    AddedAppeared(path, (hot_dir / path).stat().st_size, hot_hash)
                ]
            elif cold_index[path] != cold_hash:
                # Cold copy disagrees with the index; if hot still matches the
                # index the cold copy is the corrupted one.
                if hot_hash == cold_index[path]:
                    return [
                        Corrupted(path, os.path.getsize(hot_dir / path), hot_hash)
                    ]
                else:
                    return [
                        ModifiedCorrupted(path, os.path.getsize(hot_dir / path), hot_hash)
                    ]
            else:
                return [
                    Modified(path, (hot_dir / path).stat().st_size, hot_hash)
                ]
    # Case 2: mixed file/directory kinds (or the index records a file where a
    # directory now exists) -- unsupported collision.
    elif not (hot_dir / path).is_dir() or not (cold_dir / path).is_dir() or isinstance(
            sub_index, str):
        raise NotImplementedError("File/Folder name collision")
    # Case 3: directory on both sides -- recurse over children.
    else:
        changes: List[Change] = []
        # Extend ignore rules with this directory's .gitignore entries.
        # NOTE(review): += mutates the caller's list in place; the recursive
        # call below passes copies (hot_rules[:]) so siblings are unaffected.
        if (hot_dir / path / '.gitignore').exists():
            with open(hot_dir / path / '.gitignore', 'r') as f:
                hot_rules += map(
                    lambda r: PathAwareGitWildMatchPattern(r, hot_dir / path),
                    f.read().splitlines())
        if (cold_dir / path / '.gitignore').exists():
            with open(cold_dir / path / '.gitignore', 'r') as f:
                cold_rules += map(
                    lambda r: PathAwareGitWildMatchPattern(r, cold_dir / path),
                    f.read().splitlines())
        # At the tree root, always ignore the index file itself.
        if path == PurePath():
            hot_rules += [
                PathAwareGitWildMatchPattern('index.txt', hot_dir / path)
            ]
            cold_rules += [
                PathAwareGitWildMatchPattern('index.txt', cold_dir / path)
            ]
        # Child sets relative to their base dirs, filtered by the ignore rules.
        hot_children: Set[PurePath] = set(
            map(
                lambda abs_path: abs_path.relative_to(hot_dir),
                filter(lambda p: is_relevant(p, hot_rules),
                       (hot_dir / path).iterdir())))
        cold_children: Set[PurePath] = set(
            map(
                lambda abs_path: abs_path.relative_to(cold_dir),
                filter(lambda p: is_relevant(p, cold_rules),
                       (cold_dir / path).iterdir())))
        index_children: Set[PurePath] = set(
            map(lambda p: path / p,
                sub_index.iterdir() if sub_index is not None else []))
        # H C I: 1 0 X -- present in hot only.
        for hot_child in hot_children.difference(cold_children):
            i, size = hash_tree(hot_dir / hot_child, pbar)
            if hot_child not in cold_index:
                changes.append(Added(hot_child, size, i))
            elif i == cold_index[hot_child]:
                changes.append(Lost(hot_child, size))
            else:
                changes.append(ModifiedLost(hot_child, size, i))
        # H C I: 0 1 X -- present in cold only.
        for cold_child in cold_children.difference(hot_children):
            if cold_child not in cold_index:
                changes.append(Appeared(cold_child, 0))
                # Not hashing here, but advance the progress bar by the sizes
                # that hashing would have covered.
                for file in walk(cold_dir / cold_child, cold_rules):
                    pbar.update(file.stat().st_size)
            else:
                i, size = hash_tree(cold_dir / cold_child, pbar)
                if i == cold_index[cold_child]:
                    changes.append(Removed(cold_child, 0))
                else:
                    changes.append(RemovedCorrupted(cold_child, 0))
        # H C I: 0 0 1 -- only the index remembers it.
        for index_child in index_children.difference(hot_children).difference(
                cold_children):
            changes.append(RemovedLost(index_child, 0))
        # Recursive: (H C I: 1 1 X) -- rule lists are copied so child
        # directories cannot leak their .gitignore rules back up.
        for child in hot_children & cold_children:
            ch_changes = walk_trees(child, cold_index, hot_dir, cold_dir,
                                    hot_rules[:], cold_rules[:], pbar)
            changes.extend(ch_changes)
        return changes