def main():
    """
    Runs a data generator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Executes a data generator from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("datagenerator", help="data generator classname, e.g., "
                                              + "weka.datagenerators.classifiers.classification.LED24")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional data generator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        generator = DataGenerator(classname=parsed.datagenerator)
        if len(parsed.option) > 0:
            generator.options = parsed.option
        DataGenerator.make_data(generator, parsed.option)
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
def read_file(file_name):
	tile_set_list = []
	characteristic = []
	jvm.start()
	nmrClass = Classifier(jobject=serialization.read("models/lmt_3sd.model"))
	with open(file_name) as f: # opens file

		# reads in characteristic protein sequence and converts it to expected chemical shift values
		tile_characteristic = f.readline()
		characteristic = re.findall(r'\b[A-Za-z]{3,4}\b', tile_characteristic)
		characteristic = letters_to_numbers(characteristic)

		for line in f: # reads in NMR Data
			#reads each line and grabs numbers and na data
			#file format "a b c d"
			a, b, c, d = re.findall(r'\b\d+\.\d*\b|\bna\b', line)
			# Dealing with missing data
			if (a == "na"):
				a = -1
			if (b == "na"):
				b = -1
			if (c == "na"):
				c = -1
			if (d == "na"):
				d = -1
			# adds a new Tile to tile_set_list
			if (not (a==-1 and b==-1 and c==-1 and d==-1)):
				tile_set_list.append(Tile(a, b, c, d, nmrClass)) 
	return tile_set_list, characteristic, nmrClass
Example #3
 def simpleKMeansTrain(self, dataf, options, mname, temp=True):
     '''
     :param dataf: -> data to be clustered
     :param options: -> SimpleKMeans options
                   N -> number of clusters
                   A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                   l -> maximum number of iterations default 500
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> Random number seed (default 10)
           example => ["-N", "10", "-S", "10"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp=temp)
         clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
         clusterer.build_clusterer(data)
         print(clusterer)
         # cluster the data
         for inst in data:
             cl = clusterer.cluster_instance(inst)  # 0-based cluster index
             dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
             print("cluster=" + str(cl) + ", distribution=" + str(dist))
         self.saveModel(clusterer, 'skm', mname)
     except Exception:
         print(traceback.format_exc())
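A minimal standalone sketch of the same SimpleKMeans pattern, assuming python-weka-wrapper3 is installed and using the option list from the docstring above; "data.arff" is a placeholder path:

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.clusterers import Clusterer

jvm.start(packages=True)
try:
    data = Loader(classname="weka.core.converters.ArffLoader").load_file("data.arff")  # placeholder file
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "10"])
    kmeans.build_clusterer(data)
    for inst in data:
        # 0-based cluster index and cluster membership distribution, as in the method above
        print(kmeans.cluster_instance(inst), kmeans.distribution_for_instance(inst))
finally:
    jvm.stop()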
Example #4
    def runclustermodel(self, model, method, dataf, temp=True):
        anomalies = []
        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            cluster = self.loadClusterModel(model, method)
            clusterMembership = []
            print(cluster.number_of_clusters)
            for inst in data:
                try:
                    cl = cluster.cluster_instance(inst)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Mismatch model and data attributes',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

                dist = cluster.distribution_for_instance(inst)
                print ("cluster=" + str(cl) + ", distribution=" + str(dist))
                clusterMembership.append(cl)

            # print data.attribute_by_name('key')
            # print data.num_instances
            # print data.get_instance(3)

            pa = self.calcThreashold(dict(Counter(clusterMembership)), 21)
            if pa == 0:
                logger.warning('[%s] : [WARN] Most instances are computed as anomalies, possible error encountered!',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),)
                print("Most instances are computed as anomalies, possible error encountered!")
            else:
                for a in pa:
                    # print data.get_instance(a).get_value(0)  #todo always set key as first atribute
                    anomalies.append(data.get_instance(a).get_value(0))
                print("Detected using %s anomalies at timestamp(s) %s" % (model, str(anomalies)))
        except Exception:
            print(traceback.format_exc())
        finally:
            jvm.stop()
        return anomalies
def main():
    """
    Runs an associator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """

    parser = argparse.ArgumentParser(
        description='Executes an associator from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file")
    parser.add_argument("associator", help="associator classname, e.g., weka.associations.Apriori")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional associator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        associator = Associator(classname=parsed.associator)
        if len(parsed.option) > 0:
            associator.options = parsed.option
        loader = converters.loader_for_file(parsed.train)
        data = loader.load_file(parsed.train)
        associator.build_associations(data)
        print(str(associator))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
def main():
    """
    Runs a clusterer from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Performs clustering from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file")
    parser.add_argument("-T", metavar="test", dest="test", help="test set file")
    parser.add_argument("-d", metavar="outmodel", dest="outmodel", help="model output file name")
    parser.add_argument("-l", metavar="inmodel", dest="inmodel", help="model input file name")
    parser.add_argument("-p", metavar="attributes", dest="attributes", help="attribute range")
    parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds")
    parser.add_argument("-s", metavar="seed", dest="seed", help="seed value for randomization")
    parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index")
    parser.add_argument("-g", metavar="graph", dest="graph", help="graph output file (if supported)")
    parser.add_argument("clusterer", help="clusterer classname, e.g., weka.clusterers.SimpleKMeans")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional clusterer options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)
    params = []
    if parsed.train is not None:
        params.extend(["-t", parsed.train])
    if parsed.test is not None:
        params.extend(["-T", parsed.test])
    if parsed.outmodel is not None:
        params.extend(["-d", parsed.outmodel])
    if parsed.inmodel is not None:
        params.extend(["-l", parsed.inmodel])
    if parsed.attributes is not None:
        params.extend(["-p", parsed.attributes])
    if parsed.numfolds is not None:
        params.extend(["-x", parsed.numfolds])
    if parsed.seed is not None:
        params.extend(["-s", parsed.seed])
    if parsed.classindex is not None:
        params.extend(["-c", parsed.classindex])
    if parsed.graph is not None:
        params.extend(["-g", parsed.graph])

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        clusterer = Clusterer(classname=parsed.clusterer)
        if len(parsed.option) > 0:
            clusterer.options = parsed.option
        print(ClusterEvaluation.evaluate_clusterer(clusterer, params))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #7
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """

    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index, open(os.path.join(output_folder, dataset_name + '.json'), 'w'), indent=2
    )

    jvm.stop()
    warnings.filterwarnings('default')
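A hedged usage sketch for generate_folds; both paths are placeholders and the output folder must already exist:

# writes output_folds/my_dataset_fold_<i>.arff plus output_folds/my_dataset.json with the fold indices
generate_folds('datasets/my_dataset.arff', 'output_folds', n_folds=10, random_state=42)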
def run(arff_path, model_out):
    jvm.start()
    loader = Loader(classname = "weka.core.converters.ArffLoader")
    data = loader.load_file(arff_path)
    data.class_is_last()
    cls = Logistic()
    cls.build_classifier(data)
    cls.save_model(model_out)
    coefficients = cls.coefficients
    for coeff in coefficients:
        print(coeff)

    return coefficients
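A hedged usage sketch for run(); the paths are placeholders and the ARFF file is assumed to have the class as its last attribute:

coefficients = run("train.arff", "logistic.model")  # hypothetical paths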
Example #9
File: ml.py Project: ChrisCummins/phd
def start(*args, **kwargs):
    """
    Open a weka connection.

    May be called multiple times, but not after calling stop().

    Arguments:

        *args, **kwargs: Any additional arguments to pass to
          jvm.start().
    """
    if MODULE_SUPPORTED:
        jvm.start(*args, **kwargs)
Example #10
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load clusters
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create file with cluster group
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            cluster_file.write(str(index + 1) + ": label index=" + str(pred) + "\n")
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
Example #11
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    result = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        result = int(pred)
        break  # only the prediction for the first instance is used
    jvm.stop()
    return result
Example #12
    def run(self):
        jvm.start()  # start Java VM for WEKA

        self.show_banner()  # call function to draw the banner on the UI
        self.generate_csv(
        )  # call function to generate the csv file which will contain the url which will be tested
        self.generate_arff(
        )  # call function which will use another class to extract features from the url and save to a .arff file
        self.show_banner()  # call function to draw the banner on the UI
        self.weka_predict(
        )  # call function to predict whether the url is "phishy" or not via the generated .arff file

        jvm.stop()

        return self.prediction  # return the predicted result back to MainHandler
Example #13
def main():
    """
    Runs an associator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """

    parser = argparse.ArgumentParser(
        description=
        'Executes an associator from the command-line. Calls JVM start/stop automatically.'
    )
    parser.add_argument("-j",
                        metavar="classpath",
                        dest="classpath",
                        help="additional classpath, jars/directories")
    parser.add_argument("-X",
                        metavar="heap",
                        dest="heap",
                        help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t",
                        metavar="train",
                        dest="train",
                        required=True,
                        help="training set file")
    parser.add_argument(
        "associator",
        help="associator classname, e.g., weka.associations.Apriori")
    parser.add_argument("option",
                        nargs=argparse.REMAINDER,
                        help="additional associator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        associator = Associator(classname=parsed.associator)
        if len(parsed.option) > 0:
            associator.options = parsed.option
        loader = converters.loader_for_file(parsed.train)
        data = loader.load_file(parsed.train)
        associator.build_associations(data)
        print(str(associator))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #14
    def __init__(self, classname="weka.classifiers.functions.SMO", options='default'):
        """Constructor.
        
        Parameters
        ----------
        classname : string, optional, default = 'weka.classifiers.functions.SMO'
            Classifier initialized as default.
        options : string, optional, default = 'default'
            Classifier options initialized as default. Use the string 'default' for the default options.
        """
        if not jvm.started:
            jvm.start()

        self.classname = Config("ClassName", classname, str)
        self.options = Config("Options", options, str)
        self.reset()
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")

        data.class_is_last()  # set class attribute

        folds = 10  # assumed fold count; the original referenced an undefined variable 'k'
        learning_curve(folds, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
Example #16
def query_instance(attributes, model="out.model"):
    """
        get the cluster for defined attributes
        :params attributes: array or list
        :returns: cluster id
    """
    jvm.start()
    # create instance from the attribute values
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    jvm.stop()
    return cluster_id
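A hedged usage sketch for query_instance, assuming "out.model" holds a clusterer trained on four numeric attributes; the values below are placeholders:

cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="out.model")
print("instance assigned to cluster", cluster_id)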
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")  # load training data

        data.class_is_last()  # set class attribute

        NaiveBayes(data)
        DecisionTree(data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
def weka_algorithm(algorithm, type, minsup, minconf):
    # Weka worker start
    jvm.start()

    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arr_file)

    associator = Associator(classname=f"weka.associations.{algorithm}",
                            options=["-M", minsup, "-C", minconf, "-N", "100"])
    associator.build_associations(data)

    with open(f"results/{type}_weka_{algorithm}.output", "w") as fw:
        fw.write(str(associator))

    # Weka worker end
    jvm.stop()
def TestClassification(arff, modelInput, results, sampleName):
    # start the Java VM
    jvm.start()
    # load the analysis model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # analyze the results
    resultsFile = open(results, "w")
    if sampleName:
        resultsFile.write("样本编号\t原判断\t预测\t良性概率\t恶性概率\n")
        print("样本编号\t原判断\t预测\t良性概率\t恶性概率")
        sampleNameListFile = open(getAbsPath() + "/temp.txt", "r")
        sampleNameList = []
        for snlf in sampleNameListFile:
            sampleNameList.append(snlf.split("\n")[0])
        sampleNameListFile.close()
    else:
        resultsFile.write("序号\t原判断\t预测\t良性概率\t恶性概率\n")
        print("序号\t原判断\t预测\t良性概率\t恶性概率")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        if sampleName:
            sampleID = sampleNameList[index]
        else:
            sampleID = str(index + 1)
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred == inst.get_value(
            inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write(
            "%s\t%s\t%s\t%s\t%s" %
            (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%s\t%s\t%s\t%s\t%s" %
              (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # shut down the Java VM
    jvm.stop()
    print("检测完成")
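A hedged usage sketch for TestClassification; the file names are placeholders, and passing sampleName=False skips the temp.txt sample-ID lookup:

TestClassification("test_set.arff", "rf.model", "results.txt", False)  # hypothetical paths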
Example #20
 def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
     BustersAgent.__init__(self, index, inference, ghostAgents)
     self.previousDistances = [0,0,0,0]
     jvm.start(max_heap_size="512m")
     self.loader = Loader(classname="weka.core.converters.ArffLoader")
     self.data = self.loader.load_file("data/game_toCluster.arff")
     self.data.delete_last_attribute()
     self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
     self.clusterer.build_clusterer(self.data)
     self.inst = ""
     self.data = self.loader.load_file("data/game_toCluster.arff")
     addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
     addCluster.inputformat(self.data)
     filtered = addCluster.filter(self.data)
     self.f = open('data/addCluster.arff', 'w+')
     self.f.write(str(filtered))
     self.clustered_data = self.classifyData('data/addCluster.arff')
def index():
    if request.method == "GET":
        return render_template('bot.html')
    if request.method == "POST":
        # jvm.stop()
        jvm.start()
        f = open("instances.arff", "a")
        args = request.form.to_dict()
        weight_lb = float(args['weight']) * 2.20462
        bmi = (weight_lb / pow(float(args['height']), 2)) * 703
        hypertensive_status = args['hypertensive_status']
        heart_disease_status = args['heart_disease_status']
        if heart_disease_status == "Yes":
            heart_disease_status = '1'
        else:
            heart_disease_status = '0'
        if hypertensive_status == "Yes":
            hypertensive_status = '1'
        else:
            hypertensive_status = '0'

        st = "\n"+args['gender']+","+args['age']+","+hypertensive_status+","+heart_disease_status+","+args['marrital_status'] + \
            ","+args['work_type']+","+args['residence']+"," + \
            args['hypertension']+","+str(bmi)+",'"+args['smoking_status'].lower()+"',?"
        print(st)
        f.write(st)
        f.close()
        objects = serialization.read_all("J48.model")
        loader = Loader(classname="weka.core.converters.ArffLoader")
        csr = Classifier(jobject=objects[0])
        output_results = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        data1 = loader.load_file("instances.arff")
        data1.class_is_last()
        ev2 = Evaluation(data1)
        ev2.test_model(csr, data1, output_results)

        TESTDATA = StringIO("Instance,Actual,Predicted," +
                            output_results.buffer_content())
        df = pd.read_csv(TESTDATA)
        prediction = list(df.Predicted).pop().split(":")[1]
        print(prediction)
        # jvm.stop()
        response = {"status": "200", "prediction": prediction}
        return Response(json.dumps(response, indent=2),
                        mimetype="application/json")
    def functionProcessamento(self, ca1_r, ca1_l, ca2_ca3_r, ca2_ca3_l, sub_r,
                              sub_l, sexo, id):
        jvm.start()
        path = os.path.dirname(os.path.abspath(__file__))
        # TODO: check the individual's sex in order to load the correct model
        modelo = path + "\\naive_bayes_feminino_novo.model"
        if (sexo == "Male"):
            print("É masculino")
            modelo = path + "\\naive_bayes_feminino_novo.model"
        objects = serialization.read_all(modelo)
        classifier = Classifier(jobject=objects[0])
        loader = Loader(classname="weka.core.converters.ArffLoader")
        arquivo = open(path + "\\novo_individuo.arff", "w")
        conteudo = list()
        conteudo.append("@relation alzheimer \n\n")
        conteudo.append("@attribute doente {SIM, NAO} \n")
        conteudo.append("@attribute ca1_right real \n")
        conteudo.append("@attribute ca1_left real \n")
        conteudo.append("@attribute ca2_ca3_right real\n")
        conteudo.append("@attribute ca2_ca3_left real \n")
        conteudo.append("@attribute subic_right real \n")
        conteudo.append("@attribute subic_left real \n\n")
        conteudo.append("@data \n")
        # pass the variables here
        conteudo.append("SIM," + str(ca1_r) + "," + str(ca1_l) + "," +
                        str(ca2_ca3_r) + "," + str(ca2_ca3_l) + "," +
                        str(sub_r) + "," + str(sub_l))
        print(conteudo)
        arquivo.writelines(conteudo)
        arquivo.close()

        data = loader.load_file(path + "\\novo_individuo.arff")
        data.class_is_last()
        for index, inst in enumerate(data):
            pred = classifier.classify_instance(inst)
            dist = classifier.distribution_for_instance(inst)
            pc_doenca = round(((pred) * 100), 2)
            pc_saudavel = round(((100 - pc_doenca)), 2)
            print(" Porcentagem de alzheimer=" + str(pc_doenca) +
                  "%, porcentagem saudavel=" + str(pc_saudavel) + "%")
            alzheimer = Alzheimer.objects.get(id=id)
            alzheimer.resultado_ad = pc_doenca
            alzheimer.resultado_cn = pc_saudavel
            alzheimer.status_seg = 2
            alzheimer.save()
        jvm.stop()
def main(args=None):
    """
    Runs a data generator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.

    :param args: the command-line arguments to use, uses sys.argv if None
    :type args: list
    """

    parser = argparse.ArgumentParser(
        description=
        'Executes a data generator from the command-line. Calls JVM start/stop automatically.'
    )
    parser.add_argument("-j",
                        metavar="classpath",
                        dest="classpath",
                        help="additional classpath, jars/directories")
    parser.add_argument("-X",
                        metavar="heap",
                        dest="heap",
                        help="max heap size for jvm, e.g., 512m")
    parser.add_argument("datagenerator",
                        help="data generator classname, e.g., " +
                        "weka.datagenerators.classifiers.classification.LED24")
    parser.add_argument("option",
                        nargs=argparse.REMAINDER,
                        help="additional data generator options")
    parsed = parser.parse_args(args=args)
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        generator = DataGenerator(classname=parsed.datagenerator)
        if len(parsed.option) > 0:
            generator.options = parsed.option
        DataGenerator.make_data(generator, parsed.option)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()
def main():
    """
    Runs attribute selection from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Performs attribute selection from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-i", metavar="input", dest="input", required=True, help="input file")
    parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index")
    parser.add_argument("-s", metavar="search", dest="search", help="search method, classname and options")
    parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds")
    parser.add_argument("-n", metavar="seed", dest="seed", help="the seed value for randomization")
    parser.add_argument("evaluator", help="evaluator classname, e.g., weka.attributeSelection.CfsSubsetEval")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional evaluator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)
    params = []
    if parsed.input is not None:
        params.extend(["-i", parsed.input])
    if parsed.classindex is not None:
        params.extend(["-c", parsed.classindex])
    if parsed.search is not None:
        params.extend(["-s", parsed.search])
    if parsed.numfolds is not None:
        params.extend(["-x", parsed.numfolds])
    if parsed.seed is not None:
        params.extend(["-n", parsed.seed])

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        evaluation = ASEvaluation(classname=parsed.evaluator)
        if len(parsed.option) > 0:
            evaluation.options = parsed.option
        print(AttributeSelection.attribute_selection(evaluation, params))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #25
 def dict2arff(self, fileIn, fileOut):
     '''
     :param fileIn: name of csv file
     :param fileOut: name of new arff file
     :return:
     '''
     dataIn = os.path.join(self.dataDir, fileIn)
     dataOut = os.path.join(self.dataDir, fileOut)
     logger.info('[%s] : [INFO] Starting conversion of %s to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
     try:
         jvm.start()
         convertCsvtoArff(dataIn, dataOut)
     except Exception as inst:
         logger.error('[%s] : [ERROR] Exception occurred while converting to arff with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
     finally:
         jvm.stop()
     logger.info('[%s] : [INFO] Finished conversion of %s to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
Example #26
def SimpleLogistic(obj):
    jvm.start(packages=True)

    # TODO: the First_trial_classification.arff / First_trial_regression.arff datasets include Date as a feature; update them all to the latest version (entire new_models folder)
    # TODO: generate the code that writes input_classification.arff based on the contents of obj

    # load model
    cls = Classifier(jobject=serialization.read("new_models/SimpleLogistic.model"))
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("new_models/input_classification.arff")
    # since this is classification, the predicted value is nominal and cannot be retrieved as a numpy array

    data.class_is_last()
    for index, inst in enumerate(data):
        audience_class = cls.classify_instance(inst)

    jvm.stop()
    return audience_class + 1  # +1 because the index starts at 0
Example #27
def predicaoCluster(matricula, curso, tipo_predicao):

    dados = retornarDadosCurso(curso)
    # select the student's features
    aluno = dados.loc[dados['MATRICULA'] == matricula][:]
    aluno.drop('MATRICULA', axis=1, inplace=True)
    aluno.drop('APROVADO', axis=1, inplace=True)
    aluno.drop('COD_DISCIPLINA', axis=1, inplace=True)
    aluno.drop('SIT_MATRICULA', axis=1, inplace=True)
    aluno = aluno.head(1)

    aluno.to_csv('aluno_temp.csv', index=False)

    from weka.clusterers import Clusterer
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    import weka.core.serialization as serialization

    jvm.start()

    if curso == 'si':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_si_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_si_evasao.model")
    elif curso == 'eca':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_eca_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_eca_evasao.model")
    cluster = Clusterer(jobject=model[0])

    loader = Loader(classname="weka.core.converters.CSVLoader")
    dado_aluno = loader.load_file("aluno_temp.csv")
    for aluno in dado_aluno:
        cluster_aluno_pertence = cluster.cluster_instance(aluno)

    #jvm.stop()

    caracteristica = retornarCaracteristicaCluster(curso, tipo_predicao,
                                                   cluster_aluno_pertence)

    return caracteristica
Example #28
def extract_features():
    jvm.start()
    # load the data from the request
    content = request.json
    # build the path to the selected sound
    path = './src/sounds/test/' + content['file']
    # extract the features and store them
    features = Features.extract_feature_sound(path)
    # drop the last position = class
    features.pop(6)
    # add a value in the last position to avoid record incompatibility
    features.append(0)
    # store the parameters passed in the request - learning rate and training time
    settings = content['settings']
    # store the classification result
    classification = NeuralNetwork.perceptron_classifier(features, settings)
    # store all the extracted features
    all_features = [{
        'title': "Zero Crossing",
        'value': features[0]
    }, {
        'title': "Spectral Centroid",
        'value': features[1]
    }, {
        'title': "Spectral Rolloff",
        'value': features[2]
    }, {
        'title': "Mel Spectrogram",
        'value': features[3]
    }, {
        'title': "MFCC",
        'value': features[4]
    }, {
        'title': "Chroma STFT",
        'value': features[5]
    }]
    jvm.stop()
    # return the classification result and the extracted features
    return jsonify({
        'result': classification,
        'features': all_features,
        'status': 200
    }), 200
def associateRule(request):

    jvm.start()

    data_dir = os.path.dirname(os.path.abspath(__file__))
    data = converters.load_any_file(data_dir +
                                    "/templates/upload_files/export.csv")
    data.class_is_last()

    associator = Associator(classname="weka.associations.Apriori",
                            options=["-C", "-1", "-I"])
    # associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"])
    associator.build_associations(data)

    rules = str(associator)

    jvm.stop()

    return HttpResponse(rules)
Example #30
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    data_arff.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48",
                     options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    for index, inst in enumerate(data_arff):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        # save tree prune in txt file

    saveFile = open(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
        "w")
    saveFile.write(str(cls))
    # print(cls)
    saveFile.close()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
def detectarSpam(tuitsConDatos, modeloFilename):
    """
    #
    #   @tuitsConDatos : list of status dictionaries with the keys
    #                    tweetText, tweet_id, favorite_count and retweet_count
    #
    #   @return predicciones : list of predictions, one per input tweet.
    #                          Each prediction is a dictionary with the keys
    #                          index, actual, predicted, error and distribution
    #
    """
    predicciones = []
    try:
        jvm.start(system_cp=True, packages=True)
        predicciones = detectarSpam_(tuitsConDatos, modeloFilename)

    except Exception:
        print(traceback.format_exc())
    return predicciones
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv",index=False)

    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)

    print("3")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
Example #33
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        training_data = loader.load_file(
            "./data/adult.csv")  # load training set
        testing_data = loader.load_file(
            "./data/adult_test.csv")  # load test set

        training_data.class_is_last()
        testing_data.class_is_last()

        testNB(training_data, testing_data)
        testDtree(training_data, testing_data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
def all_feature(file):
    jvm.start(packages=True)
    data = converters.load_any_file(file)
    data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    attsel = AttributeSelection()
    attsel.search(search)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.ChiSquaredAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    chi = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    info_gain = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.GainRatioAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    gain_ratio = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.SymmetricalUncertAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    symmetric_uncertainty = t.astype(int)

    jvm.stop()

    return chi, info_gain, gain_ratio, symmetric_uncertainty
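A hedged usage sketch for all_feature, assuming a dataset file whose class attribute is last ("features.arff" is a placeholder); each returned array holds 0-based attribute indices ordered by the corresponding ranking:

chi, info_gain, gain_ratio, sym_uncert = all_feature("features.arff")
print(chi[:10])  # top attributes according to the chi-squared ranking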
def main():
    """
    Runs a data generator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description=
        'Executes a data generator from the command-line. Calls JVM start/stop automatically.'
    )
    parser.add_argument("-j",
                        metavar="classpath",
                        dest="classpath",
                        help="additional classpath, jars/directories")
    parser.add_argument("-X",
                        metavar="heap",
                        dest="heap",
                        help="max heap size for jvm, e.g., 512m")
    parser.add_argument(
        "datagenerator",
        help=
        "data generator classname, e.g., weka.datagenerators.classifiers.classification.LED24"
    )
    parser.add_argument("option",
                        nargs=argparse.REMAINDER,
                        help="additional data generator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + utils.join_options(sys.argv[1:]))

    try:
        generator = DataGenerator(classname=parsed.datagenerator)
        if len(parsed.option) > 0:
            generator.set_options(parsed.option)
        DataGenerator.make_data(generator, parsed.option)
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #36
def main(read_path, write_path, fileformat='png'):

    some_exception = None
    try:
        jvm.start()
        dataset_names = os.listdir(read_path)
        for dataset in dataset_names:
            print(dataset)

            train_data, test_data = read_datasets(os.path.join(read_path, dataset), n_fold=1)

            for inst in test_data:
                train_data.add_instance(inst)

            y = train_data.values(train_data.class_attribute.index)

            fig, ax = plt.subplots(figsize=(1, 1))  # type: (plt.Figure, plt.Axes)

            classes = sorted(np.unique(y))
            xticks = np.arange(len(classes))

            counts = Counter(y)

            ax.bar(xticks, height=[counts[c] for c in classes])

            ax.set_xticks(xticks)
            ax.set_xticklabels(classes)

            plt.axis('off')

            plt.savefig(os.path.join(write_path, '.'.join([dataset, fileformat])), format=fileformat, transparent=True)

            plt.clf()
            plt.close()

    except Exception as e:
        some_exception = e
    finally:
        jvm.stop()
        if some_exception is not None:
            raise some_exception
def TrainingModel(arff, modelOutput, clsfier):
    # start the Java VM
    jvm.start()
    # load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # train with the RandomForest algorithm: after trying several methods in the Weka GUI, this one gave the highest TPR and TNR
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # build the model
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # compile the result statistics
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("算法: " + clsfier)
    print("敏感度 TPR: " + str(TPR))
    print("特异度 TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # save the model
    clsf.serialize(modelOutput, header=train)
    # shut down the JVM
    jvm.stop()
    print("分析模型建立完成")
Example #38
def main(argv):
  if len(argv) <= 1:
    print('op action testfile/batch')
    return
  jvm.start()
  global op
  op = argv[1]
  action = argv[2]
  if action == "train":
    train(getpara(op, 'heaptime'), getpara(op, 'op')+getpara('hash', 'new'), argv[3:])
  if action == "output_model":
    output_model(getpara(op, 'heaptime'), getpara(op, 'op')+getpara('hash', 'new'), argv[3:])
  if action == "test": 
    pred = {}
    real = {}
    objs = getpara(op, 'heaptime')
    for obj in objs:
      pred[obj] = []
      real[obj] = []
    #for i in [101,102,103,104,105,106,108,109,110,111,112,114,115,116,117,118,119]:
    for i in [1,2,3,4,5,6,8,9,10,11,12,14,15,16,17,18,19] + [101,102,103,104,105,106,108,109,110,111,112,114,115,116,117,118,119]:
    #for i in [4,5,8,9,10,12,18,19]:
      test(objs, getpara(op, 'grid'), ['randtime_tpch_%d.csv' % i], pred, real)
    for obj in objs:
      print(obj, metric(pred[obj], real[obj]), len(pred[obj]))
  if action == "test_manual": 
    pred = {}
    real = {}
    objs = getpara(op, 'heaptime')
    for obj in objs:
      pred[obj] = []
      real[obj] = []
    for i in [1,2,3,4,5,6,8,9,10,11,12,14,15,16,17,18,19]:
    #for i in [4,5,8,9,10,12,18,19]:
      test_manual(objs, getpara(op, 'grid'), ['randtime_tpch_%d.csv' % i], pred, real)
    for obj in objs:
      print(obj, metric(pred[obj], real[obj]), len(pred[obj]), min(real[obj]), max(real[obj]), sum(real[obj])/len(real[obj]))
  if action == "testsingle": 
    test_single()
  jvm.stop()
Example #39
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
Example #40
File: main.py Project: aronsar/ganabi
def main():
    args = parse()
    jvm.start(max_heap_size=args.max_heap_size)
    #loading data
    print("**************************************************")
    print("*                 LOADING DATA                   *")
    print("**************************************************")

    data_loader = DataLoader(datapath = args.Datapath, 
            target_name = args.target_agent,
            arff_data_path = args.arff_data_path,
            num_games = args.num_games_source,
            )
    #data_loader.load_target_source_data()

    
    print("**************************************************")
    print("*                 TRAINING                       *")
    print("**************************************************")
    model = Classifier(classname="weka.classifiers.trees.REPTree")
    classifier = TwoStageTransfer(arff_data_path = args.arff_data_path,
            savepath = args.savepath,
            target_name = args.target_agent,
            num_target = args.num_games_target,
            num_source = args.num_games_source,
            boosting_iter=args.boosting_iter,
            fold=args.fold,
            max_source_dataset=args.max_source,
            model = model)
    
    classifier.load_data_from_arff()
    classifier.train()

    print("**************************************************")
    print("*                EVALUATING                      *")
    print("**************************************************")
    print("Evaluate for ", args.target_agent)
    classifier.evaluate_model()
    
    jvm.stop()
Example #41
    def dbscanTrain(self, dataf, options, mname, temp=True):
        '''
        :param data: -> data to be clustered
        :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                example => ["-E",  "0.9",  "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase", "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
        :return:
        '''

        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
            clusterDBSCAN.build_clusterer(data)
            print(clusterDBSCAN)
            self.saveModel(clusterDBSCAN, 'dbscan', mname)
            # cluster the data
        except Exception:
            print(traceback.format_exc())
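A hedged usage sketch for dbscanTrain; "engine" stands for a hypothetical instance of the class defining the method, the data file name is a placeholder, and the option list follows the docstring's example (it requires the optics_dbScan Weka package):

engine.dbscanTrain('metrics.csv',  # hypothetical data file
                   ["-E", "0.9", "-M", "6",
                    "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                    "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"],
                   'dbscan_model')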
def save_all_scores_on_test():
    jvm.start()
    for user in user_list:
        user_test_dir = os.listdir("../data/arff_files/" + str(user) + "/test/")
        user_test_dir.sort()
        n = len(user_test_dir)
        c = 0
        for expression_index in range(n):
            print("\n", expression_index, "=>", str(expression_list[expression_index]), ':', str(user_test_dir[expression_index]))
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../results_test/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            model_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            test_data_file = "../data/arff_files/" + str(user) + "/test/" + str(user_test_dir[expression_index])
            print(test_data_file, "=>", model_dir, "all algos", "=>", target_dir, "\n")
            
            loader = Loader(classname="weka.core.converters.ArffLoader")
            test_data = loader.load_file(test_data_file)
            test_data.class_is_last()
            for algo in algo_func_dict.keys():
                print("Algorithm: " + algo.upper())
                #if algo.upper()=="MLP_CLASSIFIER_10":
                #    continue
                model_file = model_dir + algo + ".model"
                print(model_file)
                
                j_obj = serialization.read(model_file)
                print(j_obj)
                trained_model = Classifier(jobject=j_obj)
                scores_matrix = get_classifier_score(trained_model, test_data)
                #print scores_matrix[:5]
                out_file = target_dir + algo + "_scores.csv"
                #writing scores to target file
                #scores_matrix = scores_matrix.astype(np.str)
                
                np.savetxt(out_file, scores_matrix, delimiter=",", fmt="%s")
                c = c + 1
                print(str(c) + ": Test Scores Saved =>" + str(out_file))
            #pass
    jvm.stop()
Example #43
    def runclustermodel(self, model, method, dataf, temp=True):
        anomalies = []
        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            cluster = self.loadClusterModel(model, method)
            clusterMembership = []
            print(cluster.number_of_clusters)
            for inst in data:
                try:
                    cl = cluster.cluster_instance(inst)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Mismatch model and data attributes',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

                dist = cluster.distribution_for_instance(inst)
                print("cluster=" + str(cl) + ", distribution=" + str(dist))
                clusterMembership.append(cl)

            # print data.attribute_by_name('key')
            # print data.num_instances
            # print data.get_instance(3)

            pa = self.calcThreashold(dict(Counter(clusterMembership)), 21)
            if pa == 0:
                logger.warning('[%s] : [WARN] Most instances are computed as anomalies, possible error encountered!',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),)
                print("Most instances are computed as anomalies, possible error encountered!")
            else:
                for a in pa:
                    # print data.get_instance(a).get_value(0)  #todo always set key as first atribute
                    anomalies.append(data.get_instance(a).get_value(0))
                print("Detected using %s anomalies at timestamp(s) %s" % (model, str(anomalies)))
        except Exception as e:
            print(traceback.format_exc())
        finally:
            jvm.stop()
        return anomalies
Example #44
def main():
    jvm.start()
    vote_classifier_train('./data/final/bolean_for_weka.csv', 'boolean_target',
                          True)
    vote_classifier_train('./data/final/bolean_for_weka.csv', 'boolean_target',
                          False)
    j48('./data/final/bolean_for_weka.csv', 'boolean_target', True)
    j48('./data/final/bolean_for_weka.csv', 'boolean_target', False)
    naive_bayse('./data/final/bolean_for_weka.csv', 'boolean_target', True)
    naive_bayse('./data/final/bolean_for_weka.csv', 'boolean_target', False)
    random_tree('./data/final/bolean_for_weka.csv', 'boolean_target', True)
    random_tree('./data/final/bolean_for_weka.csv', 'boolean_target', False)

    vote_classifier_train(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', True)
    vote_classifier_train(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', False)
    j48(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', True)
    j48(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', False)
    naive_bayse(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', True)
    naive_bayse(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', False)
    random_tree(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', True)
    random_tree(
        './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv',
        'reduced attacks to 4', False)
    jvm.stop()
Example #45
    def registerInitialState(self, gameState):
        BustersAgent.registerInitialState(self, gameState)
        self.distancer = Distancer(gameState.data.layout, False)

        # Used to compute the class values in the policies.
        self.clusters = 8
        self.classes = 4
        self.classCounts = [[0 for i in range(self.classes)]
                            for j in range(self.clusters)]

        self.classIndex = 2
        self.clusterIndex = 3

        self.readInstances()

        # This will be used to store the training instances.
        self.numInstances = 52
        self.numAttributes = 4
        #self.instances = [[" " for i in range(self.numAttributes)] for j in range(self.numInstances)]
        self.ins = [" " for i in range(self.numInstances)]

        # To use the library we must start the Java virtual machine (JVM)
        jvm.start()

        # Create the model
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(
            "/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff"
        )

        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                                   options=["-N", str(self.clusters)])
        self.clusterer.build_clusterer(data)

        print(self.clusterer)

        # Apply the policy
        self.politicaMax()
Example #46
def detectarSpam(tuitsConDatos,modeloFilename):
    """
    #
    #   @tuitsConDatos : list of status dictionaries with the keys
    #                    tweetText, tweet_id, favorite_count and retweet_count
    #
    #   @return predicciones : list of predictions, one per input tweet.
    #                          Each prediction is a dictionary with the keys
    #                          index, actual, predicted, error and distribution
    #
    """
    bashCommand = "python manage.py runserver;"
    predicciones = []
    try:
        jvm.start(system_cp=True, packages=True)
        predicciones = detectarSpam_(tuitsConDatos,modeloFilename)

    except Exception, e:
        print(traceback.format_exc())
        os.kill(os.getpid(), signal.SIGKILL)
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
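
# A hedged usage sketch for detectarSpam() above (hypothetical tweet data and model
# filename; assumes detectarSpam_ is available in this module):
sample_tweets = [{"tweetText": "Win a free prize now!", "tweet_id": 1,
                  "favorite_count": 0, "retweet_count": 3}]
for prediccion in detectarSpam(sample_tweets, "models/spam.model"):
    print(prediccion)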
Example #47
 def emTrain(self, dataf, options, mname, temp=True):
     '''
     :param data: -> data to be clustered
     :param options: -> EM options
                   I -> number of iterations
                   N -> number of clusters
                   M -> Minimum standard deviation for normal density (default=1.0E-6)
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> random seed (default=100)
             example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterEM = Clusterer(classname="weka.clusterers.EM",
                           options=options)
         clusterEM.build_clusterer(data)
         print clusterEM
         self.saveModel(clusterEM, 'em', mname, )
     except Exception, e:
         print(traceback.format_exc())
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    speed_instance = Instance.create_instance(numpy.array([distance]), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0: speed_class = 'down_speed'
        if speed_flag == 1: speed_class = 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
def batch_riaa_checking(inputDir):

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
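    # Note: riaa_checker() also starts and stops the JVM; python-weka-wrapper cannot
    # restart the JVM within the same process, so for batch use the start/stop calls
    # should live only at this level (and be removed from riaa_checker).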

    riaa_ok = 0
    riaa_ko = 0

    for file in os.listdir(inputDir):
        if file.endswith(".wav"):
            riaa_flag = riaa_checker(os.path.join(inputDir, file))
            if (riaa_flag == 'riaa_ko'): riaa_ko+=1
            if (riaa_flag == 'riaa_ok'): riaa_ok+=1
    
    # Stop JVM
    jvm.stop()      
    
    return (riaa_ko, riaa_ok)
    capabilities = classifier.capabilities
    print(capabilities)

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    data_capabilities = Capabilities.for_instances(iris_data)
    print(data_capabilities)
    print("classifier handles dataset: " + str(capabilities.supports(data_capabilities)))

    # disable/enable
    helper.print_title("Disable/Enable")
    capability = Capability(member="UNARY_ATTRIBUTES")
    capabilities.disable(capability)
    capabilities.min_instances = 10
    print("Removing: " + str(capability))
    print(capabilities)


if __name__ == "__main__":
    try:
        jvm.start()
        main()
    except Exception as e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
Example #52
def main():

    jvm.start(class_path=['./python-weka-wrapper.jar', './weka.jar'], max_heap_size="1024m")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("testoneuser.arff")
    print("Number of Attributes: " + str(data.num_attributes()))
    print("Number of Items: " + str(data.num_instances()))

    data.class_index = 0

    #print(data)
    #c = Classifier(classname='weka.classifiers.trees.J48', options=['-C', '0.3'])
    #c.build_classifier(data)

    # TODO: Load the data set, and extract the features
    '''

    '''

    '''
    dataset = loadarff(open('testoneuser.arff', 'r'))
    data = dataset[0]
    #print("Data Length: " + len(data))

    v = data['rollon']
    print("Data: " + str(v))
    '''

    # Temporary: Load the data set
    iris = datasets.load_iris()

    # Get the data length
    dataLength = iris.data.shape[0]
    #iris.data
    data = np.random.random((dataLength, 1))
    Zmax, Zmin = data.max(), data.min()
    data = (data - Zmin) / (Zmax - Zmin)
    data *= 6
    data = np.around(data)

    # Add the new column
    data = np.hstack((data, iris.data))
    data = np.hstack((data, np.reshape(iris.target, (-1, 1))))

    total_correct = collections.defaultdict(int)
    total = collections.defaultdict(int)

    # Iterate through the users leaving one out at a time
    for ignoredUserId in range(0, 7):

        #get the training set
        training_data = data[data[:, 0] != ignoredUserId, :]
        training_label = training_data[:, -1]

        classifiers = cls.getAllClassifiers(splitDataOnFeatures(training_data), training_label)

        # Get the test set
        testData = data[data[:, 0] == ignoredUserId, :]
        allTestData = splitDataOnFeatures(testData)
        correct_values = testData[:, -1]

        # Predict the value based on the classifier
        results = predictAll(classifiers, allTestData, correct_values)

        # Find any differences
        for type, result in results.iteritems():
            total_correct[type] = total_correct[type] + result[0]
            total[type] = total[type] + result[1]

    for type, correct in total_correct.iteritems():
        print("Accuracy for " + str(type) + ": " + str(correct/total[type]))
Example #53
from weka.core.dataset import Attribute, Instances
import weka.core.jvm as jvm
import javabridge
import numpy as np

import pdb


import json
import os

from nemoApi import nemoApi, AIParam
from nemoConfig import nemoConfig

# Start JVM on file load
# Required that only ONE jvm exist for all threads
jvm.start(class_path=["mysql-connector-java-5.1.38-bin.jar"])

class WekaWrapper:

	def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0):
		self.questionID = questionID
		self.algorithm = algorithm
		self.classifier = classifier
		self.parameters = parameters
		self.modelParams = modelParams
		self.api = nemoApi()
		self.config = nemoConfig()
		self.optimizer = optimizer
		self.predict = predict
		self.prediction = None
Example #54
        """
        self.assertEqual("\\n\\t", str(classes.backquote("\n\t")))
        self.assertEqual("hello\\tworld", str(classes.backquote("hello\tworld")))
        self.assertEqual("\t\n", str(classes.unbackquote("\\t\\n")))
        self.assertEqual("hello\tworld\n", str(classes.unbackquote("hello\\tworld\\n")))

    def test_from_and_to_commandline(self):
        """
        Tests the from_commandline and to_commandline methods.
        """
        cmdline = "weka.classifiers.trees.J48 -C 0.3 -M 4"
        cls = classes.from_commandline(
            cmdline=cmdline, classname="weka.classifiers.Classifier")
        self.assertIsNotNone(cls)
        self.assertEqual(cmdline, cls.to_commandline())


def suite():
    """
    Returns the test suite.
    :return: the test suite
    :rtype: unittest.TestSuite
    """
    return unittest.TestLoader().loadTestsFromTestCase(TestClasses)


if __name__ == '__main__':
    jvm.start(packages=True)   # necessary for setupgenerator
    unittest.TextTestRunner().run(suite())
    jvm.stop()
        cls = classifiers.Classifier(
            classname="weka.classifiers.functions.SMOreg",
            options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])
        ms.classifier = cls
        self.assertEqual(cls.to_commandline(), ms.classifier.to_commandline(), msg="classifiers differ")

        cls = classifiers.Classifier(classname="weka.classifiers.functions.LinearRegression")
        ms.classifier = cls
        ms.evaluation = ms.tags_evaluation.find("RMSE")
        self.assertEqual("RMSE", str(ms.evaluation), "evaluation differs: " + str(ms.evaluation))

        ms.evaluation = "ACC"
        self.assertEqual("ACC", str(ms.evaluation), "evaluation differs: " + str(ms.evaluation))
        cls = classifiers.Classifier(classname="weka.classifiers.trees.J48")
        ms.classifier = cls


def suite():
    """
    Returns the test suite.
    :return: the test suite
    :rtype: unittest.TestSuite
    """
    return unittest.TestLoader().loadTestsFromTestCase(TestClassifiers)


if __name__ == '__main__':
    jvm.start(packages=True)   # necessary for multisearch
    unittest.TextTestRunner().run(suite())
    jvm.stop()
#!/usr/bin/env python

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.logger.setLevel(jvm.logging.WARNING)
jvm.start(packages=True, max_heap_size="512m")

# Each instance has nominal class and numeric attributes
loader = Loader(classname="weka.core.converters.ArffLoader")
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")

# Build the C4.5 classifier on the training data
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
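
# testData is loaded above but never used; a hedged sketch of evaluating on it as well:
print("\n\n=========== Test results ================\n\n")
test_evaluation = Evaluation(trainData)
test_evaluation.test_model(classifier, testData)
print(test_evaluation.matrix())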
Example #57
 def __init__(self, dataDir = '.'):
     self.dataDir = dataDir 
     
     jvm.start()
    mparam.expression = "pow(BASE,I)"
    lparam = ListParameter()
    lparam.prop = "classifier.C"
    lparam.values = ["-2.0", "-1.0", "0.0", "1.0", "2.0"]
    multi.parameters = [mparam, lparam]
    cls = Classifier(
        classname="weka.classifiers.functions.SMOreg",
        options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])
    multi.classifier = cls
    multi.build_classifier(train)
    print("Model:\n" + str(multi))
    print("\nBest setup:\n" + multi.best.to_commandline())


def main():
    """
    Calls the parameter optimization method(s).
    """
    #gridsearch()
    multisearch()


if __name__ == "__main__":
    try:
        jvm.start(packages=True)
        main()
    except Exception, e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
import weka.core.jvm as jvm
jvm.start()
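# Alternative ways of starting the JVM are shown below for reference; note that only
# the first jvm.start() call in a process takes effect (later calls are ignored).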

jvm.start(system_cp=True, packages=True)
jvm.start(packages="/usr/local/lib/python2.7/dist-packages/weka")

jvm.start(max_heap_size="512m")

data_dir="CSDMC2010_SPAM/CSDMC2010_SPAM/TRAINING"


from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.options = ["-C", "0.3"]
print(cls.options)
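
# A hedged sketch of actually training the configured J48 on an ARFF file
# (hypothetical path; the snippet above only sets the options):
from weka.core.converters import Loader
loader = Loader(classname="weka.core.converters.ArffLoader")
spam_data = loader.load_file("spam_train.arff")  # hypothetical ARFF built from data_dir
spam_data.class_is_last()
cls.build_classifier(spam_data)
print(cls)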


jvm.stop()
Example #60
from utilities import *
import weka.core.jvm as jvm

from weka.core.converters import Loader, Saver

from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

jvm.start(max_heap_size="3072m")

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()

#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

evaluation = Evaluation(data)
#evaluation.crossvalidate_model(classifier, data, 10, Random(42))
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))
res = evaluation.summary()
res += "\n" + evaluation.matrix()
#f = open('./Dataset/resultsGrid.txt', 'w')
#f.write(res)

print res

jvm.stop()