def mixed_effect_chisq():
    k = 5000

    X_path = "../pickle_dir/summary/X_summary_pickle"
    y_path = "../pickle_dir/summary/y_summary_pickle"
    vocab_vect_path = "../pickle_dir/summary/vocab_vectorizer_summary_pickle"
    pickle_x_mixed_path = "../pickle_dir/summary/X_mixed_summary_pickle"

    # keep the k attributes with the highest chi-squared scores
    feat_selector = SelectKBest(chi2, k=k)

    X_mixed = _generate_mixed_effect_matrix(X_path, y_path, feat_selector)

    pickle_obj(X_mixed, pickle_x_mixed_path)
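# _generate_mixed_effect_matrix is called above but not defined in this section.
# The function below is only a minimal sketch of what it presumably does with the
# chi-squared selector it receives, assuming unpickle_obj is the counterpart of
# pickle_obj; the "_sketch" name and the label flattening are assumptions, not
# the project's confirmed behavior.
def _generate_mixed_effect_matrix_sketch(X_path, y_path, feat_selector):
    X = unpickle_obj(X_path)
    y = unpickle_obj(y_path)

    # chi2 scoring needs one label per sample; take the first genre when an
    # instance carries several (hypothetical simplification)
    y_flat = [labels[0] if isinstance(labels, (list, tuple)) else labels
              for labels in y]

    # fit the selector on the counts and keep only the k best columns
    return feat_selector.fit_transform(X, y_flat)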
def db_to_pickle(src_db, secondary_path=""):
    """
    Convert database data to pickle files of X, y, and ref_index.

    The database objects must have the following properties:
        ref_index: integer index mapping the object back to its URLToGenre entry
        attr_map: dictionary of attribute to count
        short_genres: genres for each attr_map, aka its label(s)

    Results are stored in PICKLE_DIR/$secondary_path/.

    :return: X, y, ref_index. y is nx1, where n is the number of instances; it is
        an np array of lists, where each list holds all the genres an instance
        may have.
    """
    if secondary_path == "":
        print("No secondary path set.")

    # first pass: collect the full attribute vocabulary
    vocabulary_set = set()
    for all_gram_obj in src_db.objects.no_cache():
        vocabulary_set |= set(all_gram_obj.attr_map.keys())
    del all_gram_obj

    print("The size of the url vocabulary: {}".format(len(vocabulary_set)))

    vocabulary_dict = coll.Counter(vocabulary_set)

    print("Fitting vocabulary")
    vectorizer = DictVectorizer()
    vectorizer.fit([vocabulary_dict])

    print("Transforming")
    # second pass: accumulate attribute maps, labels, and reference indexes
    X_stack = []
    y_stack = []
    ref_index = []
    for c, all_gram_obj in enumerate(src_db.objects.no_cache()):
        if c % 10000 == 0:
            print(c)

        X_stack.append(all_gram_obj.attr_map)
        y_stack.append(all_gram_obj.short_genres)
        ref_index.append(all_gram_obj.ref_index)
        del all_gram_obj

    X = vectorizer.transform(X_stack)
    y = np.array(y_stack)
    ref_index = np.array(ref_index)

    # store X, y, and ref_index into the pickle dir
    dir_path = os.path.join(settings.PICKLE_DIR, secondary_path)
    os.makedirs(dir_path, exist_ok=True)

    X_path = os.path.join(dir_path, "X_{}_pickle".format(secondary_path))
    y_path = os.path.join(dir_path, "y_{}_pickle".format(secondary_path))
    ref_path = os.path.join(dir_path, "refIndex_{}_pickle".format(secondary_path))
    vectorizer_path = os.path.join(dir_path, "vocab_vectorizer_{}_pickle".format(secondary_path))

    pickle_obj(X, X_path)
    pickle_obj(y, y_path)
    pickle_obj(ref_index, ref_path)
    pickle_obj(vectorizer, vectorizer_path)
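# A hedged usage sketch for db_to_pickle. The _Fake* classes below are hypothetical
# test doubles, not part of the project; they only illustrate the contract the
# function relies on: src_db.objects.no_cache() must be iterable more than once and
# yield objects exposing ref_index, attr_map, and short_genres.
class _FakeGram:
    def __init__(self, ref_index, attr_map, short_genres):
        self.ref_index = ref_index
        self.attr_map = attr_map
        self.short_genres = short_genres

class _FakeQuerySet:
    def __init__(self, docs):
        self._docs = docs

    def no_cache(self):
        # a fresh iterator per call stands in for mongoengine's no_cache() queryset
        return iter(self._docs)

class _FakeDB:
    objects = _FakeQuerySet([
        _FakeGram(0, {"news": 3, "sports": 1}, ["News"]),
        _FakeGram(1, {"shop": 5, "cart": 2}, ["Shopping"]),
    ])

# db_to_pickle(_FakeDB, secondary_path="summary")
# # -> would write X_summary_pickle, y_summary_pickle, refIndex_summary_pickle,
# #    and vocab_vectorizer_summary_pickle under PICKLE_DIR/summary/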
def unsupervised(settings, train_set, clusterer, clustering_alg_cls):
    clustering_logger.info("Unsupervised Algorithm training size: {}".format(train_set.X.shape))

    for num_cluster in sorted(settings.num_clusters, reverse=True):
        X, y, ref_ids = train_set.to_matrices()

        additional_notes = ""
        if train_set.X.shape[0] <= settings.spectre_clustering_limit:
            # small training sets: fall back to agglomerative clustering on a dense matrix
            clustering_alg = AgglomerativeClustering(n_clusters=num_cluster)
            additional_notes = "_agglomerative"
            X = X.toarray()
        else:
            clustering_alg = clustering_alg_cls(n_clusters=num_cluster)

        clustering_logger.info("Using {}".format(str(clustering_alg) + additional_notes))

        res_labels = clustering_alg.fit_predict(X)

        occurence_dict = clusterer.get_clusters_genre_distribution(y, res_labels)

        # the directory to store the results of clustering
        res_dir = os.path.join(UNSUPERVISED_DIR, settings.clustering_alg, *settings.parent_clusters)
        os.makedirs(res_dir, exist_ok=True)

        # ELIMINATE CLUSTERS WITH FEWER THAN 12 PAGES
        for cluster_name, cluster_genre_count in list(occurence_dict.items()):
            total_count_in_cluster = sum(count for genre, count in cluster_genre_count.items())

            if total_count_in_cluster < 12:
                del occurence_dict[cluster_name]
            else:
                path = os.path.join(res_dir, "{}_{}_pages".format(num_cluster, cluster_name))
                # OUTPUT the pages in the current cluster
                clusterer.output_pages_in_cluster(path, train_set.ref_index[res_labels == cluster_name])

        res_file = "{}/{}.pdf".format(res_dir, str(num_cluster))
        clusterer.generate_cluster_distribution_graphs(res_file, occurence_dict, res_labels)

        # output closeness metrics (only for algorithms that expose cluster centers)
        if additional_notes == "":
            inter_cluster, inter_cluster_count, intra_cluster, intra_cluster_count = \
                Clustering().cluster_closeness(clustering_alg.cluster_centers_, X, res_labels)
            clusterer.output_cluster_closeness("{}/{}.txt".format(res_dir, num_cluster), inter_cluster,
                                               inter_cluster_count, intra_cluster, intra_cluster_count)

        # do a dfs on clusters bigger than the prescribed size
        if settings.break_up_clusters:
            breakup_candidates = []
            for i in range(0, num_cluster):
                if np.sum(res_labels == i) >= settings.max_cluster_size:
                    breakup_candidates.append(i)

            X_path = os.path.join(res_dir, "X")
            y_path = os.path.join(res_dir, "y")
            ref_indexes_path = os.path.join(res_dir, "ref_indexes")

            clustering_logger.info("Pickling X,y,ref_index to conserve memory")
            pickle_obj(train_set.X, X_path)
            pickle_obj(train_set.y, y_path)
            pickle_obj(train_set.ref_index, ref_indexes_path)

            for cluster_name in breakup_candidates:
                clustering_logger.info("Breaking up cluster {} of size greater than {}".format(
                    cluster_name, settings.max_cluster_size))

                settings.parent_clusters.append("{}_{}".format(num_cluster, cluster_name))

                # restrict the training set to the current cluster and recurse
                selector = (res_labels == cluster_name)
                train_set.X = train_set.X[selector]
                train_set.y = train_set.y[selector]
                train_set.ref_index = train_set.ref_index[selector]

                unsupervised(settings, train_set, clusterer, clustering_alg_cls)

                settings.parent_clusters.pop()

                # restore the full training set from the cached pickles
                train_set.X = unpickle_obj(X_path)
                train_set.y = unpickle_obj(y_path)
                train_set.ref_index = unpickle_obj(ref_indexes_path)

            # remove the cache files
            os.remove(ref_indexes_path)
            os.remove(X_path)
            os.remove(y_path)
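# A hedged driver sketch for unsupervised(). The project's real settings and
# train_set classes are not shown in this section, so SimpleNamespace objects
# stand in for them; only the attributes unsupervised() actually reads are
# filled in. MiniBatchKMeans is one plausible clustering_alg_cls (it exposes
# the cluster_centers_ used above), and passing Clustering() as the clusterer
# assumes that class also provides the callback methods used above. Kept
# commented out because it needs the real pickles on disk.
#
# from types import SimpleNamespace
# from sklearn.cluster import MiniBatchKMeans
#
# X = unpickle_obj("../pickle_dir/summary/X_summary_pickle")
# y = unpickle_obj("../pickle_dir/summary/y_summary_pickle")
# ref_index = unpickle_obj("../pickle_dir/summary/refIndex_summary_pickle")
#
# # to_matrices here is a placeholder good for a single top-level call only,
# # so break_up_clusters is left off and the recursive breakup path is not taken
# train_set = SimpleNamespace(X=X, y=y, ref_index=ref_index,
#                             to_matrices=lambda: (X, y, ref_index))
# run_settings = SimpleNamespace(num_clusters=[8, 16],
#                                spectre_clustering_limit=2000,
#                                clustering_alg="minibatch_kmeans",
#                                parent_clusters=[],
#                                break_up_clusters=False,
#                                max_cluster_size=5000)
#
# unsupervised(run_settings, train_set, Clustering(), MiniBatchKMeans)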