def load_test_data(filename, subreddits):
    """Load labelled instances from a comma-separated file using a known vocabulary.

    Every non-blank line is read as ``label,name,name,...``.  The feature
    index of a name is its (first) position in ``subreddits``; names that do
    not occur in ``subreddits`` are skipped.

    Args:
        filename: Path of the file to read.
        subreddits: Sequence of names whose positions define the feature
            indices.  Entries are assumed to be hashable (e.g. strings).

    Returns:
        A list with one ``Instance`` per non-blank line, in file order.
    """
    # Build the name -> position table once instead of scanning the list
    # (``in`` followed by ``index``) for every token.  setdefault keeps the
    # first position of a duplicated name, which is what list.index() gives.
    index_of = {}
    for position, name in enumerate(subreddits):
        index_of.setdefault(name, position)

    instances = []
    with open(filename) as reader:
        for line in reader:
            if not line.strip():
                continue
            # Remove the line terminator before splitting so that neither the
            # last name nor the label of a feature-less line keeps a "\n".
            fields = line.rstrip("\n").split(",")
            label = ClassificationLabel(fields[0])
            feature_vector = FeatureVector()
            for name in fields[1:]:
                feature = index_of.get(name)
                if feature is not None:
                    feature_vector.add(feature, 1)
            instances.append(Instance(feature_vector, label))
    return instances
def predict(self, instance):
    """Return the label with the highest score for ``instance``.

    Candidate labels are the integers ``1 .. max(self.all_labels)``; each one
    is scored with ``self.compute_dot_product(features, label)``.  When
    several labels share the best score the smallest one wins, because a
    later candidate replaces the current best only on a strictly greater
    score.  If no score is greater than negative infinity, the label is
    built from -1.

    Args:
        instance: Object exposing ``_feature_vector.feature_vector``.

    Returns:
        A ``ClassificationLabel`` wrapping the winning label.
    """
    # The features do not change between candidates, so look them up once.
    features = instance._feature_vector.feature_vector
    best_label, best_score = -1, -float('inf')
    for label in range(1, max(self.all_labels) + 1):
        score = self.compute_dot_product(features, label)
        # Labels are visited in increasing order, so an explicit
        # "equal score and smaller label" tie-break could never fire.
        if score > best_score:
            best_label, best_score = label, score
    return ClassificationLabel(best_label)
def load_data(filename):
    """Read a ``label index:value ...`` file into a list of instances.

    The file is read twice.  The first pass finds the largest feature index
    and stores it, plus one, in the module-level name ``indexmax``; if the
    module-level name ``pastindexmax`` is not -1 its value is used instead.
    The second pass builds one ``Instance`` per non-blank line, with a
    ``FeatureVector(indexmax)`` holding every non-zero value whose index is
    not greater than ``indexmax``.

    Args:
        filename: Path of the space-separated data file.

    Returns:
        A list of ``Instance`` objects in file order.

    Raises:
        ValueError: If a label or index is not an integer, or a value is not
            a float.
    """
    global indexmax

    # Pass 1: determine the largest feature index in the file.
    indexmax = 1
    with open(filename) as reader:
        for line in reader:
            # Same guard as pass 2: without it a whitespace-only line would
            # yield empty tokens and fail in int() before pass 2 could skip it.
            if not line.strip():
                continue
            for item in line.split(" ")[1:]:
                index_text = item.split(":")[0]
                try:
                    index = int(index_text)
                except ValueError:
                    raise ValueError("Unable to convert index " + index_text + " to integer.")
                if index > indexmax:
                    indexmax = index
    indexmax = indexmax + 1
    # A size fixed by an earlier load (anything but -1) takes precedence.
    if pastindexmax != -1:
        indexmax = pastindexmax

    # Pass 2: build the instances.
    instances = []
    with open(filename) as reader:
        for line in reader:
            if not line.strip():
                continue
            split_line = line.split(" ")
            label_string = split_line[0]
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string + " to integer.")
            label = ClassificationLabel(int_label)
            feature_vector = FeatureVector(indexmax)
            for item in split_line[1:]:
                parts = item.split(":")
                try:
                    index = int(parts[0])
                except ValueError:
                    raise ValueError("Unable to convert index " + parts[0] + " to integer.")
                try:
                    value = float(parts[1])
                except ValueError:
                    raise ValueError("Unable to convert value " + parts[1] + " to float.")
                # Zero values are not stored; indices above the limit are dropped.
                if value != 0.0 and index <= indexmax:
                    feature_vector.add(index, value)
            instances.append(Instance(feature_vector, label))
    return instances
def load_data(filename):
    """Parse a ``label index:value ...`` file into ``Instance`` objects.

    Blank lines are ignored.  Zero-valued features are not added to the
    vector.  As a side effect the module-level names ``max_size`` and
    ``max_max_index`` are rebound: ``max_size`` is reset to 0 and left there,
    while ``max_max_index`` ends up as the largest ``_max_index`` seen on any
    loaded feature vector (0 if none is larger).

    Raises ValueError when a label or index is not an integer or a value is
    not a float.
    """
    global max_size
    global max_max_index

    loaded = []
    with open(filename) as reader:
        max_size = 0
        max_max_index = 0
        for raw in reader:
            if not raw.strip():
                continue

            # First field is the label, the rest are index:value pairs.
            fields = raw.split(" ")
            label_text = fields[0]
            try:
                label_number = int(label_text)
            except ValueError:
                raise ValueError("Unable to convert " + label_text + " to integer.")
            label = ClassificationLabel(label_number)

            vector = FeatureVector()
            for pair in fields[1:]:
                pieces = pair.split(":")
                try:
                    position = int(pieces[0])
                except ValueError:
                    raise ValueError("Unable to convert index " + pieces[0] + " to integer.")
                try:
                    amount = float(pieces[1])
                except ValueError:
                    raise ValueError("Unable to convert value " + pieces[1] + " to float.")
                if amount != 0.0:
                    vector.add(position, amount)

            loaded.append(Instance(vector, label))

            # Track the widest vector seen so far.
            if vector._max_index > max_max_index:
                max_max_index = vector._max_index
    return loaded
def load_instances(self, filename="output/word_frequencies.txt"):
    """Append the instances stored in a word-frequency file to ``self.instances``.

    Each non-blank line is read as a label followed by whitespace-separated
    ``index:count`` tokens, both parts being integers.  The label is passed to
    ``ClassificationLabel`` as text.

    Args:
        filename: File to read; defaults to the path this loader has always
            used.

    Returns:
        ``self.instances`` after the new instances have been appended.

    Raises:
        ValueError: If an index or count is not an integer.
    """
    with open(filename) as reader:
        for line in reader:
            # Whitespace splitting discards the line terminator and any empty
            # tokens (trailing or doubled spaces) that would otherwise reach
            # int() or end up inside the label.
            tokens = line.split()
            if not tokens:
                # A blank line carries no label; do not create an instance.
                continue
            label = ClassificationLabel(tokens[0])
            fv = FeatureVector()
            for token in tokens[1:]:
                num = token.split(":")
                fv.add(int(num[0]), int(num[1]))
            self.instances.append(Instance(fv, label))
    return self.instances
def load_more_data(filename):
    """Load labelled instances from a comma-separated file and build the vocabulary.

    Every non-blank line is read as ``label,name,name,...``.  Names are
    numbered in order of first appearance in the file, and that number is the
    feature index passed to ``FeatureVector.add(index, 1)`` once per
    occurrence of the name on the line.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(instances, subreddits)``: one ``Instance`` per non-blank
        line in file order, and the list of distinct names where each name's
        position is its feature index.
    """
    subreddits = []
    # name -> position in ``subreddits``; replaces repeated list scans
    # (``not in`` / ``index``) that grew with the vocabulary size.
    index_of = {}
    instances = []
    # Indices are assigned on first sight, so one pass yields the same
    # numbering as collecting the vocabulary first and re-reading the file.
    with open(filename) as reader:
        for line in reader:
            if not line.strip():
                continue
            # Remove the line terminator before splitting so that neither the
            # last name nor the label of a feature-less line keeps a "\n".
            fields = line.rstrip("\n").split(",")
            label = ClassificationLabel(fields[0])
            feature_vector = FeatureVector()
            for name in fields[1:]:
                feature = index_of.get(name)
                if feature is None:
                    feature = len(subreddits)
                    index_of[name] = feature
                    subreddits.append(name)
                feature_vector.add(feature, 1)
            instances.append(Instance(feature_vector, label))
    return (instances, subreddits)
def load_data(filename):
    """Parse a ``label index:value ...`` file and report the largest index.

    Blank lines are ignored and zero-valued features are not added to the
    vector, although their indices still count towards the reported maximum.

    Returns a tuple ``(instances, highest_idx)`` where ``highest_idx`` is the
    largest feature index parsed (0 if none exceeds 0).

    Raises ValueError when a label or index is not an integer or a value is
    not a float.
    """
    loaded = []
    highest_idx = 0
    with open(filename) as reader:
        for raw in reader:
            if not raw.strip():
                continue

            # First field is the label, the rest are index:value pairs.
            fields = raw.split(" ")
            label_text = fields[0]
            try:
                label_number = int(label_text)
            except ValueError:
                raise ValueError("Unable to convert " + label_text + " to integer.")
            label = ClassificationLabel(label_number)

            vector = FeatureVector()
            for pair in fields[1:]:
                pieces = pair.split(":")
                try:
                    position = int(pieces[0])
                except ValueError:
                    raise ValueError("Unable to convert index " + pieces[0] + " to integer.")
                if position > highest_idx:
                    highest_idx = position
                try:
                    amount = float(pieces[1])
                except ValueError:
                    raise ValueError("Unable to convert value " + pieces[1] + " to float.")
                if amount != 0.0:
                    vector.add(position, amount)

            loaded.append(Instance(vector, label))
    return loaded, highest_idx
def create_instances(self, filename="output/word_frequencies.txt"):
    """Build one instance per description and write them to a text file.

    For every entry of ``self.descriptions`` the ``'description'`` text is
    cleaned with ``self.clean_text``; each resulting word is looked up in
    ``self.corpus`` and the count stored under that position in a fresh
    ``FeatureVector`` is incremented.  The instance, labelled with the
    entry's 0-based position, is appended to ``self.instances`` and written
    as ``<label> <index>:<count> ... `` followed by a newline.  After each
    entry ``self.update_progress`` receives the fraction completed.

    Args:
        filename: Output path; defaults to the path this writer has always
            used.

    Raises:
        ValueError: If a cleaned word is not in ``self.corpus`` (raised by
            ``list.index`` when ``self.corpus`` is a list).
    """
    total = len(self.descriptions)
    # The context manager closes the file even if an entry fails midway.
    with open(filename, "wb") as fo:
        for position, d in enumerate(self.descriptions):
            stripped = self.clean_text(d['description'])
            label = ClassificationLabel(position)
            fv = FeatureVector()
            for word in stripped:
                feature = self.corpus.index(word)
                fv.add(feature, int(fv.get(feature) + 1))
            self.instances.append(Instance(fv, label))

            line = str(label) + " " + "".join(
                str(f) + ":" + str(fv.get(f)) + " " for f in fv.get_keys())
            # Append the newline before encoding: the file is binary, and
            # adding a text "\n" to encoded bytes fails on Python 3.
            fo.write((line + "\n").encode('utf8'))

            # float() first so the ratio is a true division on any Python.
            self.update_progress(float(position + 1) / total)
def load_data(filename):
    """Parse a ``label index:value ...`` file into ``Instance`` objects.

    Blank lines are ignored and zero-valued features are not added to the
    feature vector.

    Args:
        filename: Path of the space-separated data file.

    Returns:
        A list with one ``Instance`` per non-blank line, in file order.

    Raises:
        ValueError: If a label or index is not an integer, or a value is not
            a float.
    """
    instances = []
    with open(filename) as reader:
        for line in reader:
            if not line.strip():
                continue

            # First field is the label, the rest are index:value pairs.
            split_line = line.split(" ")
            label_string = split_line[0]
            try:
                int_label = int(label_string)
            except ValueError:
                raise ValueError("Unable to convert " + label_string + " to integer.")
            label = ClassificationLabel(int_label)

            feature_vector = FeatureVector()
            for item in split_line[1:]:
                parts = item.split(":")
                try:
                    index = int(parts[0])
                except ValueError:
                    raise ValueError("Unable to convert index " + parts[0] + " to integer.")
                try:
                    value = float(parts[1])
                except ValueError:
                    raise ValueError("Unable to convert value " + parts[1] + " to float.")
                if value != 0.0:
                    feature_vector.add(index, value)

            instances.append(Instance(feature_vector, label))
    # No post-load inspection of instances[1]: that debugging leftover raised
    # IndexError for files with fewer than two instances and its result was
    # never used.
    return instances
def predict(self, instance):
    """Label ``instance`` with the result of ``self.sign`` on its features.

    The features are read from ``instance._feature_vector.feature_vector``
    and the value returned by ``self.sign`` is wrapped in a
    ``ClassificationLabel``.
    """
    features = instance._feature_vector.feature_vector
    decision = self.sign(features)
    return ClassificationLabel(decision)
def predict(self, instance):
    """Label ``instance`` 1 or 0 from the sign of its dot product.

    Returns ``ClassificationLabel(1)`` when
    ``self.compute_dot_product(instance)`` is greater than or equal to zero,
    and ``ClassificationLabel(0)`` otherwise.
    """
    score = self.compute_dot_product(instance)
    if score >= 0:
        return ClassificationLabel(1)
    return ClassificationLabel(0)