def main_with_explore():
    """Explore every numerical class/property pair on the raw endpoint,
    train a model on the extracted data, save it, and self-test it on the
    same data."""
    combos = easysparql.get_all_classes_properties_numerical(RAW_ENDPOINT)
    data, meta_data = data_extraction.data_and_meta_from_class_property_uris(class_property_uris=combos)
    # Sanity check: report loudly if the extraction produced any NaN values.
    if np.any(np.isnan(data)):
        print("there is a nan in the data")
        print("**************************")
    else:
        print("no nans in the data")
    data_extraction.save_data_and_meta_to_files(data=data, meta_data=meta_data)
    model = learning.train_with_data_and_meta(data=data, meta_data=meta_data)
    # Self-test: cluster assignments are computed against the training meta itself.
    meta_with_clusters = learning.get_cluster_for_meta(training_meta=meta_data, testing_meta=meta_data)
    learning.test_with_data_and_meta(model=model, data=data, meta_data=meta_with_clusters)
def main_with_class_explore(class_uri='http://dbpedia.org/ontology/Person',
                            test_files=('person_waist.csv', 'person_hipsize.csv', 'person_bustsize.csv')):
    """
    Explore the numerical properties of a single class (T-Box), train a
    model on the extracted values, self-test it, then predict on local CSV
    files.

    :param class_uri: class to explore (generalized from the previously
        hard-coded dbpedia Person URI; default preserves old behavior).
    :param test_files: names of CSV files to run prediction on with the
        trained model (default preserves old behavior).
    """
    properties = easysparql.get_numerical_properties_for_class_tbox(endpoint=RAW_ENDPOINT, class_uri=class_uri)
    if properties is None:
        return
    # One (class, property) pair per discovered numerical property —
    # equivalent to the old zip(len(properties) * [class_uri], properties).
    class_property_combinations = [(class_uri, prop) for prop in properties]
    data, meta_data = data_extraction.data_and_meta_from_class_property_uris(
        class_property_uris=class_property_combinations)
    model = learning.train_with_data_and_meta(data=data, meta_data=meta_data)
    meta_with_clusters = learning.get_cluster_for_meta(training_meta=meta_data, testing_meta=meta_data)
    learning.test_with_data_and_meta(model=model, data=data, meta_data=meta_with_clusters)
    # now testing some files
    test_data, test_meta_data = data_extraction.data_and_meta_from_files(list(test_files))
    learning.predict(model, data=test_data, meta_data=test_meta_data)
def main():
    """Train on a mix of DBpedia class/property data and a local CSV file,
    then run prediction on a held-out mix of the same kinds of sources."""
    # Training sources: remote (class, property) pairs.
    train_pairs = [
        ('http://xmlns.com/foaf/0.1/Person', 'http://dbpedia.org/ontology/numberOfMatches'),
        ('http://schema.org/Place', 'http://www.georss.org/georss/point'),
        ('http://schema.org/Place', 'http://dbpedia.org/property/latd'),
        ('http://schema.org/Place', 'http://dbpedia.org/property/longd'),
    ]
    # Testing sources: the place coordinate pairs only.
    test_pairs = [
        ('http://schema.org/Place', 'http://dbpedia.org/property/latd'),
        ('http://schema.org/Place', 'http://dbpedia.org/property/longd'),
    ]
    data1, meta_data1 = data_extraction.data_and_meta_from_class_property_uris(train_pairs)
    data2, meta_data2 = data_extraction.data_and_meta_from_files(['novHighC.csv'])
    # Combine the remote and local training material into one data set.
    data, meta_data = data_manipulation.merge_data_and_meta_naive(
        data1=data1, meta_data1=meta_data1, data2=data2, meta_data2=meta_data2)
    for clus, md in enumerate(meta_data):
        print("cluster %d => type: %s" % (clus, md["type"]))
    model = learning.train_with_data_and_meta(data=data, meta_data=meta_data)
    test_data1, test_meta_data1 = data_extraction.data_and_meta_from_class_property_uris(test_pairs)
    test_data2, test_meta_data2 = data_extraction.data_and_meta_from_files(['mayHighC.csv'])
    # merge the two data sets
    test_data, test_meta_data = data_manipulation.merge_data_and_meta_naive(
        data1=test_data1, meta_data1=test_meta_data1, data2=test_data2, meta_data2=test_meta_data2)
    learning.predict(model=model, data=test_data, meta_data=test_meta_data)
def explore_and_train_tbox(endpoint=None, model_id=None):
    """
    Explore all numerical class/property combinations on the endpoint
    (T-Box), extract their values, train a model, and persist it, reporting
    progress/state to the MLModel row identified by model_id.

    :param endpoint: SPARQL endpoint URL (required).
    :param model_id: id of the MLModel row used for progress reporting (required).
    """
    if endpoint is None:
        print("explore_and_train_tbox> endpoint is None")
        return
    if model_id is None:
        print("explore_and_train_tbox> model_id should not be None")
        return
    try:
        update_progress_func = partial(update_model_progress_for_partial, model_id)
        update_model_state(model_id=model_id, new_state=MLModel.RUNNING, new_progress=0,
                           new_notes="Extracting numerical class/property combinations")
        # Safe function
        classes_properties_uris = easysparql.get_all_classes_properties_numerical(endpoint=endpoint)
        update_model_state(model_id=model_id, new_progress=0,
                           new_notes="extracting values from gathered class/property")
        data, meta_data = data_extraction.data_and_meta_from_class_property_uris(
            endpoint=endpoint, class_property_uris=classes_properties_uris,
            update_func=update_progress_func, isnumericfilter=True)
        update_model_state(model_id=model_id, new_progress=0, new_notes="training the model")
        if data is None:
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="No data is extracted from the endpoint")
            return
        # Sanity check: report loudly if the extraction produced NaNs.
        if np.any(np.isnan(data)):
            print("explore_and_train_tbox> there is a nan in the data")
            print("**************************")
        else:
            print("explore_and_train_tbox> no nans in the data")
        model = learning.train_with_data_and_meta(data=data, meta_data=meta_data,
                                                  update_func=update_progress_func)
        # Guard against training failure (consistent with explore_and_train_abox,
        # which already stops when the trained model is None).
        if model is None:
            update_model_state(model_id=model_id, new_state=MLModel.STOPPED,
                               new_notes="learning failed as model is None")
            return
        update_model_state(model_id=model_id, new_progress=0, new_notes="organizing the clusters")
        meta_with_clusters = learning.get_cluster_for_meta(
            training_meta=meta_data, testing_meta=meta_data, update_func=update_progress_func)
        update_model_state(model_id=model_id, new_progress=0, new_notes="Saving the model data")
        model_file_name = data_extraction.save_model(model=model, meta_data=meta_data,
                                                     file_name=str(model_id) + " - ")
        if model_file_name is None:
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="Error Saving the model")
            return
        m = MLModel.objects.filter(id=model_id)
        if len(m) == 1:
            # Record the saved model file on the MLModel row.
            m = m[0]
            m.file_name = model_file_name
            m.save()
            update_model_state(model_id=model_id, new_progress=100, new_state=MLModel.COMPLETE,
                               new_notes="Completed")
        else:
            # The row vanished while we were training.
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="model is deleted")
    except Exception as e:
        # Last-resort boundary handler: record the failure on the model row.
        print("explore_and_train_tbox> Exception %s" % str(e))
        traceback.print_exc()
        update_model_state(model_id=model_id, new_state=MLModel.STOPPED,
                           new_notes="Not captured error: " + str(e))
def main_with_local_files():
    """Train a model from the bundled local DBpedia CSV files and self-test
    it on the same data."""
    data, meta_data = data_extraction.data_and_meta_from_files(get_local_dbpedia_files())
    model = learning.train_with_data_and_meta(data=data, meta_data=meta_data)
    # Self-test: the training meta doubles as the testing meta.
    clustered_meta = learning.get_cluster_for_meta(training_meta=meta_data, testing_meta=meta_data)
    learning.test_with_data_and_meta(model=model, data=data, meta_data=clustered_meta)
def explore_and_train_abox(endpoint=None, model_id=None, classes_uris=None, min_num_of_objects=90):
    """
    Explore the given classes on the endpoint (A-Box), extract property
    values, train a model, and persist it, reporting progress/state to the
    MLModel row identified by model_id.

    :param endpoint: SPARQL endpoint URL (required).
    :param model_id: id of the MLModel row used for progress reporting (required).
    :param classes_uris: list of class URIs to explore (defaults to none).
    :param min_num_of_objects: minimum number of values a class/property
        pair must have to be included.
    """
    if endpoint is None:
        print("explore_and_train_abox> endpoint is None")
        return
    if model_id is None:
        print("explore_and_train_abox> model_id should not be None")
        return
    # Avoid the shared mutable-default pitfall (was classes_uris=[]).
    if classes_uris is None:
        classes_uris = []
    try:
        update_progress_func = partial(update_model_progress_for_partial, model_id)
        update_model_state(model_id=model_id, new_state=MLModel.RUNNING, new_progress=0,
                           new_notes="Extracting numerical class/property combinations")
        classes_properties_uris = []
        for idx, class_uri in enumerate(classes_uris):
            update_progress_func(int(idx * 1.0 / len(classes_uris) * 100))
            properties = easysparql.get_properties_for_class_abox(
                endpoint=endpoint, class_uri=class_uri, raiseexception=True)
            for prop in properties:
                classes_properties_uris.append((class_uri, prop))
        update_progress_func(100)
        update_model_state(model_id=model_id, new_progress=0,
                           new_notes="extracting values from gathered class/property")
        data, meta_data = data_extraction.data_and_meta_from_class_property_uris(
            endpoint=endpoint, class_property_uris=classes_properties_uris,
            update_func=update_progress_func, isnumericfilter=False,
            min_num_of_objects=min_num_of_objects)
        update_model_state(model_id=model_id, new_progress=0, new_notes="training the model")
        if data is None:
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="No data is extracted from the endpoint")
            return
        # Sanity check: report loudly if the extraction produced NaNs.
        if np.any(np.isnan(data)):
            print("explore_and_train_abox> there is a nan in the data")
            print("**************************")
        else:
            print("explore_and_train_abox> no nans in the data")
        model = learning.train_with_data_and_meta(data=data, meta_data=meta_data,
                                                  update_func=update_progress_func)
        if model is None:
            # (typo fixed: was "leaning failed as model is None")
            update_model_state(model_id=model_id, new_state=MLModel.STOPPED,
                               new_notes="learning failed as model is None")
            return
        update_model_state(model_id=model_id, new_progress=0, new_notes="organizing the clusters")
        # Computed for its side effects/logging; the clustered meta is not
        # consumed here (the self-test step is currently disabled).
        meta_with_clusters = learning.get_cluster_for_meta(
            training_meta=meta_data, testing_meta=meta_data, update_func=update_progress_func)
        update_model_state(model_id=model_id, new_progress=0, new_notes="Saving the model data")
        model_file_name = data_extraction.save_model(model=model, meta_data=meta_data,
                                                     file_name=str(model_id) + " - ")
        if model_file_name is None:
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="Error Saving the model")
            return
        m = MLModel.objects.filter(id=model_id)
        if len(m) == 1:
            # Record the saved model file on the MLModel row.
            m = m[0]
            m.file_name = model_file_name
            m.save()
            update_model_state(model_id=model_id, new_progress=100, new_state=MLModel.COMPLETE,
                               new_notes="Completed")
        else:
            # The row vanished while we were training.
            update_model_state(model_id=model_id, new_progress=0, new_state=MLModel.STOPPED,
                               new_notes="model is deleted")
    except Exception as e:
        # Last-resort boundary handler: record the failure on the model row.
        print("explore_and_train_abox> Exception %s" % str(e))
        traceback.print_exc()
        update_model_state(model_id=model_id, new_state=MLModel.STOPPED,
                           new_notes="Raised error: " + str(e))