def load_instance(filepath):
    """Load every document under *filepath* as a labeled Instance.

    The label is taken from the directory name: a path ending in 'neg/'
    yields 'negative' instances, 'pos/' yields 'positive' ones.
    NOTE(review): *filepath* must end with a path separator — the label
    check reads filepath[-4:-1] and files are opened as filepath+filename.

    :param filepath: directory path with trailing separator, e.g. 'data/pos/'
    :return: list of Instance(filename, label, deduped_tokens, tokens)
    :raises Exception: if the path does not end in 'neg/' or 'pos/'
    """
    import os

    # Determine the label once, before touching any file, so a wrong path
    # fails fast instead of after the first file has been read.
    # (Originally this shelled out to IPython's `ls` and re-checked the
    # path inside the loop.)
    tag = filepath[-4:-1]
    if tag == 'neg':
        label = 'negative'
    elif tag == 'pos':
        label = 'positive'
    else:
        raise Exception("Wrong path!")

    ins_list = []
    # sorted() matches the lexicographic order `ls` produced.
    for filename in sorted(os.listdir(filepath)):
        tokens = []
        # `with` guarantees the file is closed even if tokenization raises.
        with open(filepath + filename, 'r') as f:
            for line in f:
                tokens += nltk.regexp_tokenize(line, pattern=r"\w+")
        # De-duplicated token list, as required by a Bernoulli model.
        data = util.del_dup(tokens)
        ins_list.append(Instance(filename, label, data, tokens))
    return ins_list
def _collect_counts(self, instance_list):
    """Populate feature/label count tables from *instance_list*.

    Builds two numpy arrays on self:
      - count_table   : (n_features, n_labels) feature-given-label counts,
                        initialized to 1 for add-one (Laplace) smoothing.
      - count_y_table : (n_labels,) raw label counts, initialized to 0.

    Each instance's raw_data is de-duplicated in place first, so a token
    contributes at most one count per document (Bernoulli event model).
    """
    n_features = self.feature_codebook.size()
    n_labels = self.label_codebook.size()
    # Ones (not zeros) in the feature table give every (feature, label)
    # pair a pseudo-count of 1 — the smoothing happens here at init time.
    self.count_table = numpy.ones((n_features, n_labels))
    self.count_y_table = numpy.zeros(n_labels)

    for instance in instance_list:
        # Collapse repeated tokens; mutates the instance deliberately so
        # later stages see the same Bernoulli-ready data.
        instance.raw_data = util.del_dup(instance.raw_data)
        self.count_y_table[self.label_codebook.get_index(instance.label)] += 1
        for word in instance.raw_data:
            # Count only words the feature codebook knows about.
            if self.feature_codebook.has_label(word):
                row = self.feature_codebook.get_index(word)
                col = self.label_codebook.get_index(instance.label)
                self.count_table[row, col] += 1