def run(self):
    """Split the hard-label file into per-fold train/test files.

    Reads ``uid<TAB>label`` lines from the file named by ``self.pass_in``,
    shuffles them with ``dpDataset.shuffle`` and, for every ``(train, test)``
    pair produced by ``dpDataset.kFolds(hard_labels, FOLDS)``, writes the
    files ``train<i>`` and ``test<i>`` into the directory ``self.pass_out``,
    where ``i`` counts the folds starting at 0. Each output line has the
    same ``uid<TAB>label`` form as the input.

    Raises:
        ValueError: if an input line does not split into exactly two
            tab-separated fields.
    """
    label_file = self.pass_in
    out_dir = self.pass_out

    # Read the hard label file into [(id, label), (id, label), ...].
    # The ``with`` block guarantees the handle is closed even when a
    # malformed line raises part-way through the file.
    hard_labels = []
    with open(label_file) as fin:
        for line in fin:
            uid, label = line.rstrip('\n').split('\t')
            hard_labels.append((uid, label))

    hard_labels = dpDataset.shuffle(hard_labels)

    # Write each fold's split into the output directory: the train file
    # first, then the test file, both suffixed with the fold index.
    folds = dpDataset.kFolds(hard_labels, FOLDS)
    for round_count, (train, test) in enumerate(folds):
        for prefix, pairs in (("train", train), ("test", test)):
            out_path = os.path.join(out_dir, prefix + str(round_count))
            with open(out_path, 'w') as fout:
                for uid, label in pairs:
                    fout.write(uid + '\t' + label + '\n')
def getSeeds():
    """Return the de-duplicated, flattened feature list used as seeds.

    Loads the libsvm feature file, shuffles it, builds an ``Effective``
    object from it and asks that object for its feature list with
    ``n_count=10``. The (possibly nested) result is printed as-is, then
    flattened and de-duplicated through a ``set``, so the order of the
    returned list is not defined.
    """
    def _flatten(nested):
        # Depth-first walk that yields every leaf. Strings are iterable
        # but are treated as leaves rather than being split into characters.
        for item in nested:
            if (not isinstance(item, collections.Iterable)
                    or isinstance(item, basestring)):
                yield item
                continue
            for leaf in _flatten(item):
                yield leaf

    source_path = '../feature_set2/ver2.8-efollowing.libsvm'
    samples = dp_dataset.shuffle(dp_dataset.load(source_path))
    nested_seeds = Effective(samples).getFeatureList(n_count=10)

    print(nested_seeds)
    unique_seeds = set(_flatten(nested_seeds))
    return list(unique_seeds)