def readARFF(filename):
    """
    Read a two-class ARFF file and return its data rows as examples.

    Header handling:
      * blank lines and lines starting with "%" are skipped;
      * every "@attribute <name> <type>" line registers <name> in the
        feature id set, in file order. The line must split into exactly
        three whitespace-separated tokens, so a type specification that
        contains spaces (e.g. "{a, b}") is rejected with ValueError;
      * the attribute called "class" (case-insensitive) must list exactly
        two comma-separated names inside a one-character delimiter pair
        (e.g. "{pos,neg}"); the first name gets class id 1 and the
        second gets class id -1;
      * "@data" switches the parser to data mode.

    Each data row is split on "," and its columns are numbered from 1.
    The column whose registered attribute name is "class" is looked up
    in the class set; all other columns are converted with float().
    NOTE(review): the column numbering assumes IdSet(1) hands out ids
    starting from 1 in registration order -- confirm against IdSet.

    Parameters:
      filename -- path of the ARFF file to read.

    Returns:
      A list with one entry per data row, each of the form
      [exampleId, classId, features, {}], where exampleId is
      "BreastCancer.d<N>.s0.x0" (N is the 0-based row index), and
      features maps column number -> float value.

    Raises:
      AssertionError -- on an attribute declaration after "@data", a data
                        row before "@data", a class attribute that does
                        not have exactly two names, an unknown class
                        label, or a data row with no class column.
      ValueError     -- on a malformed "@attribute" line or a
                        non-numeric feature value.
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # Read the whole file up front because the progress counter needs the
    # total line count; the context manager guarantees the handle is closed
    # even if reading fails.
    with open(filename, "rt") as f:
        lines = f.readlines()
    counter = ProgressCounter(len(lines), "ARFFLine")
    inData = False
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue  # blank line or ARFF comment
        if line[0] == "@":
            category = line.split()[0].lower()
            if category == "@attribute":
                _, name, attrType = line.split()
                assert not inData, "@attribute after @data"
                if name.lower() == "class":
                    name = name.lower()
                    # Drop the surrounding delimiters, then split the names.
                    classNames = attrType[1:-1].split(",")
                    assert len(classNames) == 2, "exactly two classes are required"
                    classSet.defineId(classNames[0].strip(), 1)
                    classSet.defineId(classNames[1].strip(), -1)
                featureSet.getId(name)
            elif category == "@relation":
                assert not inData, "@relation after @data"
            elif category == "@data":
                inData = True
        else:
            assert inData, "data row before @data"
            features = {}
            # Reset per row so that a row lacking the class column cannot
            # silently inherit the label of the previous row.
            classId = None
            for count, column in enumerate(line.split(","), 1):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    # Class names were stripped when they were defined, so
                    # strip the value here too ("1.0, pos" must match "pos").
                    classId = classSet.getId(column.strip(), False)
                    assert classId is not None, "unknown class label: " + column
            assert classId is not None, "data row has no class column"
            exampleId = "BreastCancer.d" + str(len(examples)) + ".s0.x0"
            examples.append([exampleId, classId, features, {}])
    return examples
def getClassSet(rows, classSet=None):
    """
    Collect the class names used in rows and register them in a class set.

    Every value found under the "class" and "prediction" keys of each row
    is gathered. If no classSet is supplied, a new Core.IdSet.IdSet is
    created and the negative class -- named either "1" or "neg", exactly
    one of which must occur in rows -- is defined with id 1. All remaining
    class names are then passed to classSet.getId() in sorted order, so
    the resulting registration order is deterministic.

    Parameters:
      rows     -- iterable of mappings with "class" and "prediction" keys.
      classSet -- optional existing id set to extend; it only needs a
                  getId(name) method. When None, a new IdSet is built.

    Returns:
      The classSet that was passed in, or the newly created one.

    Raises:
      AssertionError -- when a new set has to be built and rows contain
                        both or neither of the names "1" and "neg".
    """
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])

    if classSet is None:
        # Imported here so the project module is only needed when a new
        # set actually has to be constructed.
        from Core.IdSet import IdSet
        classSet = IdSet()
        hasOne = "1" in classNames
        hasNeg = "neg" in classNames
        assert not (hasOne and hasNeg), 'both "1" and "neg" are present'
        assert hasOne or hasNeg, 'neither "1" nor "neg" is present'
        classSet.defineId("1" if hasOne else "neg", 1)

    # In the case of multiclass, give integer ids for the other classes
    for name in sorted(classNames):
        if name not in ("1", "neg"):
            classSet.getId(name)
    return classSet