class ProjectVectorBuilder():
    """Classify projects from their README and description text.

    A naive Bayes classifier is trained once at construction time and then
    applied to the text of every project in ``project_data``.
    """

    # Retained so code that reads ``ProjectVectorBuilder.projects`` keeps
    # working. Each instance gets its own dict in ``__init__`` so results no
    # longer leak between builders through this shared mutable object.
    projects = {}

    def __init__(self, project_data):
        """Store the project data and train the classifier.

        Args:
            project_data: Mapping of project name to a dict that provides at
                least the keys ``'readme'`` and ``'description'``.
        """
        self.project_data = project_data
        self.projects = {}
        self.nb = NaiveBayesClassifier(rm.TRAINDATA_VOCAB, rm.TRAINDATA_DATASET)
        self.nb.train()

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding byte strings as UTF-8.

        Undecodable bytes are dropped. Values that are already text are
        returned unchanged (decoding them again would raise ``TypeError``).
        """
        if isinstance(value, bytes):
            return value.decode('utf-8', 'ignore')
        return value

    def build_projects_vector(self):
        """Classify every project that has some text and return the results.

        For each project the README and the description are concatenated and
        passed to ``self.nb.classify``; the first element of its result is
        expected to be a mapping of category to score. Projects with no text
        at all are skipped.

        Returns:
            ``self.projects``: mapping of project name to a dict with the
            keys ``'class_prob'`` (the full score mapping) and
            ``'description'``, plus ``'category'`` and ``'prob'`` (the
            best-scoring category and its score) when the score mapping is
            not empty.
        """
        print("In build projects")
        for name, project in self.project_data.items():
            readme = project['readme']
            # Bad case: a README that was not found arrives as an empty list
            # (or None) instead of text.
            if readme is None or isinstance(readme, list):
                text = ""
            else:
                text = self._to_text(readme)

            description = project['description']
            if description is not None:
                text += self._to_text(description)

            if not text:
                continue

            prob_data = self.nb.classify(text)[0]
            entry = {
                'class_prob': prob_data,
                'description': description,
            }
            if len(prob_data) > 0:
                # Look up the best-scoring category once and unpack it.
                category, prob = max(prob_data.items(),
                                     key=operator.itemgetter(1))
                entry['category'] = category
                entry['prob'] = prob
            self.projects[name] = entry
        return self.projects
def k_fold(self, k):
    """Run an interactive k-fold cross-validation and print the hit rates.

    Each entry of ``self.__data`` is cut into ``k`` equally sized slices
    (any remainder beyond ``k`` full slices is never held out). For every
    run, one slice per entry is held out for testing, a fresh ``NBC`` is
    trained on the rest, and the fraction of held-out documents whose
    prediction equals the corresponding entry of ``self.__label`` is
    recorded in ``self.__precision`` as ``(hit_rate, miss_rate)``.

    The method waits for Enter on stdin before starting and, when
    ``self.__verbose`` is set, after every run.

    Args:
        k: Number of folds; must be greater than 1.
    """
    assert k > 1
    print('Starting ' + str(k) + '-fold cross-validation.')
    input('Press Enter to continue...')

    for fold in range(k):
        print('Run ' + str(fold+1))
        nb = NBC()

        # Split every per-label document list into held-out and kept parts.
        held_out = []
        kept = []
        for docs in self.__data:
            fold_size = int(len(docs)/k)
            start = fold * fold_size
            stop = start + fold_size
            held_out.append(docs[start:stop])
            kept.append(docs[:start] + docs[stop:])

        nb.train(kept, self.__label)

        # Score the held-out documents against their expected label.
        hits = 0
        misses = 0
        for position, expected in enumerate(self.__label):
            for doc in held_out[position]:
                if nb.predict(doc) == expected:
                    hits += 1
                else:
                    misses += 1

        tested = sum(len(part) for part in held_out)
        self.__precision.append((hits/tested, misses/tested))

        if self.__verbose:
            nb.info(self.__level)
            input('Press Enter to continue...')

    # Report every recorded run, then the mean hit rate.
    for hit_rate, _miss_rate in self.__precision:
        print('Precision: %.2d' % (hit_rate*100))
    average = sum(pair[0] for pair in self.__precision)
    average /= len(self.__precision)
    print('Average precision: %.2d' % (average*100))
def __init__(self, project_data):
    """Keep the project data and prepare a trained classifier.

    Args:
        project_data: Project data to keep on the instance as
            ``self.project_data``.
    """
    self.project_data = project_data

    # Build the classifier from the training vocabulary and dataset exposed
    # by ``rm``, then train it right away so it is ready for use.
    classifier = NaiveBayesClassifier(
        rm.TRAINDATA_VOCAB,
        rm.TRAINDATA_DATASET,
    )
    self.nb = classifier
    classifier.train()