def read_main_df():
    """Load 'up.csv' as a DataFrame and normalise its column types.

    The tab-separated file is located through ``find_path`` and read with
    ``read_sv``. The ``FixationOOB`` column must be present; its 'True' /
    'False' strings are replaced by booleans, and the remaining columns are
    then cast through ``convert_type``.

    :return: the loaded frame
    :raises AssertionError: if the ``FixationOOB`` column is missing
    """
    # Reading main data into memory
    frame = read_sv(
        return_as=pd.DataFrame,
        path=find_path('up.csv'),
        encoding='utf-8',
        delimiter='\t',
        header=True,
    )
    assert 'FixationOOB' in frame.columns

    # Mapping FixationOOB strings to booleans; other values are left untouched
    frame['FixationOOB'] = frame['FixationOOB'].replace({'True': True, 'False': False})

    target_dtypes = {
        'Timestamp': 'int64',
        'FixationDuration': 'int64',
        'MappedFixationPointX': 'int64',
        'MappedFixationPointY': 'int64',
        'StimuliName': 'object',
    }
    convert_type(frame, **target_dtypes)
    return frame
def look_up_index(self, data, content, equal=True, greater=False, smaller=False):
    """
    Binary-search one index level for the block that may hold ``content``.

    Every line of ``data`` is split on ``self.long_sep`` into a key and a
    location, and the location is split on ``self.short_sep`` into an offset
    and a length. Keys are converted with ``utils.convert_type`` using
    ``self.attr_type`` before being compared with ``content``, so the lines
    have to be sorted by converted key for the search to be meaningful.

    NOTE(review): the roll-down logic reads as if each key is the smallest
    value stored in the block it points to - confirm against the index writer.

    :param data: text of one index level, one entry per line
    :param content: value to locate; must be comparable with the converted keys
    :param equal: the comparison being resolved includes equality
    :param greater: the comparison being resolved includes "greater than"
    :param smaller: the comparison being resolved includes "smaller than"
    :return: ``[may_exist, offset, data_size]``. ``offset`` and ``data_size``
        are the strings stored in the index, or the sentinel ``[False, 0, 0]``
        when no block can contain a match.
    :raises Exception: if none of ``equal``, ``greater``, ``smaller`` is set
    """
    # first, we locate the content's position
    lines = data.splitlines()
    low, high = 0, len(lines) - 1
    middle = 0
    found = False
    while low <= high:
        middle = low + (high - low) // 2
        # Integer, Real, Text, Date, Boolean
        key = utils.convert_type(lines[middle].split(self.long_sep)[0], self.attr_type)
        if key == content:
            found = True
            break
        if key < content:
            low = middle + 1
        else:
            high = middle - 1

    if not (equal or greater or smaller):
        raise Exception("This situation should be resolved earlier")

    if found:
        # We "catch it" in this level.
        if equal or greater:
            # For a strict ">" the caller still has to validate the result.
            block = middle
        elif middle == 0:
            # Strictly smaller than the very first key: nothing can match.
            return [False, 0, 0]
        else:
            block = middle - 1
    elif not lines:
        # Empty index level: previously crashed with IndexError on lines[-1].
        return [False, 0, 0]
    elif high >= 0:
        # Not found here, but the block starting at the closest smaller key
        # may still contain it.
        block = high
    elif equal or greater:
        # content sorts before every key. lines[-1] used to wrap around to the
        # LAST entry here; the first block is the only sensible place to start.
        block = 0
    else:
        # Strictly smaller than something below every key: nothing can match.
        return [False, 0, 0]

    location = lines[block].split(self.long_sep)[1].split(self.short_sep)
    return [True, location[0], location[1]]
def read_main_df():
    """Load 'up.csv' as a DataFrame with typed columns.

    The tab-separated file is located through ``find_path`` and read with
    ``read_sv``. The ``FixationOOB`` column must be present; its textual
    values are parsed into Python literals and all listed columns are then
    cast through ``convert_type`` (``FixationOOB`` to boolean, ``'?'``).

    :return: the loaded frame
    :raises AssertionError: if the ``FixationOOB`` column is missing
    :raises ValueError: if a ``FixationOOB`` cell is not a Python literal
    """
    # Function-scope import keeps this block self-contained.
    import ast

    # Reading main data into memory
    df = read_sv(return_as=pd.DataFrame, path=find_path('up.csv'),
                 encoding='utf-8', delimiter='\t', header=True)
    assert 'FixationOOB' in df.columns

    # SECURITY: this used to call eval() on every cell, which executes
    # arbitrary code found in the data file. ast.literal_eval only accepts
    # literals ('True', 'False', numbers, ...). Series.map also replaces the
    # per-row iterrows()/df.at loop with a single pass over the column.
    df['FixationOOB'] = df['FixationOOB'].map(ast.literal_eval)

    convert_type(df, Timestamp='int64', FixationDuration='int64',
                 MappedFixationPointX='int64', MappedFixationPointY='int64',
                 StimuliName='str', FixationOOB='?')
    print('Read df as pd.DataFrame')
    return df
def create_level_index(self, level, input_list=None):
    """
    Register and, where applicable, write the index file for ``level``.

    Level 0 stands for the meta file: its filename is recorded in
    ``self.index_dict`` but nothing is saved here. Level 1 is the primary
    index: the ``self.attr`` column of the CSV is read, every value is
    converted with ``utils.convert_type`` and paired with its row position,
    and the sorted pairs are saved. Any other level is a sparse index saved
    from ``input_list``.

    :param level: index level to create
    :param input_list: entries for a sparse index (levels other than 0 and 1)
    :return: whatever ``self.save_index`` returns, or ``None`` for level 0
    :raises Exception: if ``level`` is already present in ``self.index_dict``
    """
    target_file = utils.convert_filename(self.csv_file, FILE_TYPE_INDEX, attr=self.attr,
                                         level=level, create_dir=True)
    if level in self.index_dict:
        raise Exception("Found used level, check your code")
    self.index_dict[level] = target_file

    if level == 0:
        return None
    if level != 1:
        return self.save_index(target_file, input_list, True)

    # create primary index
    frame = pd.read_csv(self.csv_file, encoding='utf-8')
    entries = [
        [utils.convert_type(value, self.attr_type), row_id]
        for row_id, value in enumerate(list(frame[self.attr]))
    ]
    entries.sort()
    return self.save_index(target_file, entries)
def look_up_data(self, data, content, equal=True, greater=False, smaller=False):
    """
    Binary-search sorted index lines and collect the matching locations.

    Every line of ``data`` is split on ``self.long_sep`` into a key and a
    location. Keys are converted with ``utils.convert_type`` using
    ``self.attr_type`` before being compared with ``content``, so the lines
    have to be sorted by converted key for the search to be meaningful.

    :param data: index text, one entry per line
    :param content: value to compare the keys against
    :param equal: include the entry whose key equals ``content``
    :param greater: include entries positioned after ``content``
    :param smaller: include entries positioned before ``content``
        (takes precedence over ``greater`` when combined with ``equal``)
    :return: for a pure ``equal`` lookup, the location of the matching line
        split on ``self.short_sep`` (or ``[]`` if there is no match); for
        every range lookup, a list of such split locations, possibly empty
    :raises Exception: if none of ``equal``, ``greater``, ``smaller`` is set
    """
    lines = data.splitlines()
    total = len(lines)
    low, high = 0, total - 1
    middle = 0
    found = False
    while low <= high:
        middle = low + (high - low) // 2
        # Integer, Real, Text, Date, Boolean
        key = utils.convert_type(lines[middle].split(self.long_sep)[0], self.attr_type)
        if key == content:
            found = True
            break
        if key < content:
            low = middle + 1
        else:
            high = middle - 1

    # After an unsuccessful search, `high` is the last line below `content`
    # and `low` the first line above it; either may fall outside the list,
    # in which case the slice below is simply empty.
    if equal and smaller:
        start, stop = 0, (middle + 1 if found else high + 1)
    elif equal and greater:
        # The not-found case used to return the tuple (True, result_list);
        # it now returns a plain list like every other branch.
        start, stop = (middle if found else low), total
    elif equal:
        if not found:
            return []
        # Single match: one flat location, not a list of locations.
        return lines[middle].split(self.long_sep)[1].split(self.short_sep)
    elif smaller:
        start, stop = 0, (middle if found else high + 1)
    elif greater:
        start, stop = (middle + 1 if found else low), total
    else:
        raise Exception("This situation should be resolved earlier")

    return [
        line.split(self.long_sep)[1].split(self.short_sep)
        for line in lines[start:stop]
    ]