예제 #1
0
 def process_data(self, input_dir):
     """Collect "ole" results for every query/location pair found on disk.

     Walks ``input_dir`` recursively. For each combination of
     ``self.queries`` and ``self.locations`` whose file name (as built by
     ``get_filename("ole", ...)`` from the whitespace-stripped query and
     location) exists somewhere under ``input_dir``, the pair is appended
     to ``self.queries_data`` and the value returned by
     ``get_result_from_csv_file`` for that file is appended to
     ``self.results["ole"]``. The two lists are index-aligned; pairs with
     no matching file are skipped.

     Args:
         input_dir: Root directory searched recursively for result files.
     """
     self.queries_data = []
     self.results = {"ole": []}

     # Base file name -> full path. setdefault keeps the first file seen in
     # walk order when the same name appears in several directories, and the
     # dict gives constant-time lookups instead of scanning a list per pair.
     path_by_name = {}
     for root, _, file_names in os.walk(input_dir):
         for file_name in file_names:
             path_by_name.setdefault(file_name, os.path.join(root, file_name))

     for query in self.queries:
         tname = "".join(query.split())
         for location in self.locations:
             lname = "".join(location.split())
             path = path_by_name.get(get_filename("ole", tname, lname))
             if path is None:
                 continue
             self.queries_data.append({"q": query, "l": location})
             self.results["ole"].append(get_result_from_csv_file(path))
예제 #2
0
 def format_data(self, input_dir, train_percent=70):
   """Load "ole" results, split them at random and write train/test files.

   Walks ``input_dir`` recursively. For each combination of
   ``self.queries`` and ``self.locations`` whose file name (as built by
   ``get_filename("ole", ...)``) exists under ``input_dir``, the pair is
   appended to ``self.queries_data`` and the value returned by
   ``get_fields_from_csv_file`` is appended to ``self.results["ole"]``.
   Every whitespace-stripped query not yet in ``Rank.topic_hash`` is
   assigned the current ``Rank.topic_ctr``, which is then incremented.

   The collected entries are shuffled, split into ``self.train_data`` and
   ``self.test_data`` (objects built by ``Rank.get_data_obj``), and passed
   to ``write_to_file`` as "train.dat" and "test.dat".

   Args:
     input_dir: Root directory searched recursively for result files.
     train_percent: Percentage of the collected entries that goes into the
       training set; the count is rounded down.
   """
   self.queries_data = []
   self.results = {"ole": []}

   # Base file name -> full path. setdefault keeps the first file seen in
   # walk order when a name repeats, and the dict avoids a list scan per pair.
   path_by_name = {}
   for root, _, file_names in os.walk(input_dir):
     for file_name in file_names:
       path_by_name.setdefault(file_name, os.path.join(root, file_name))

   for query in self.queries:
     tname = "".join(query.split())
     if tname not in Rank.topic_hash:
       Rank.topic_hash[tname] = Rank.topic_ctr
       Rank.topic_ctr += 1
     for location in self.locations:
       lname = "".join(location.split())
       # TODO: include epsilon data also later
       path = path_by_name.get(get_filename("ole", tname, lname))
       if path is None:
         continue
       self.queries_data.append({"q": query, "l": location})
       self.results["ole"].append(get_fields_from_csv_file(path))

   self.data = []
   # shuffle() mutates in place, so it needs a real list (a Python 3 range
   # object is immutable).
   indexes = list(range(len(self.queries_data)))
   shuffle(indexes)
   # Slice bounds must be integers; floor division keeps the split point an
   # int on both Python 2 and Python 3.
   l_train = int(len(indexes) * train_percent // 100)
   train_indexes = indexes[:l_train]
   test_indexes = indexes[l_train:]

   ole_results = self.results["ole"]
   self.train_data = [
     Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
     for i in train_indexes
   ]
   self.test_data = [
     Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
     for i in test_indexes
   ]
   write_to_file("train.dat", self.train_data)
   write_to_file("test.dat", self.test_data)