Example #1
from sklearn.feature_selection import SelectKBest, chi2

def mixed_effect_chisq():
    #select the top k features by chi-squared score and pickle the reduced matrix
    k=5000
    X_path="../pickle_dir/summary/X_summary_pickle"
    y_path="../pickle_dir/summary/y_summary_pickle"
    vocab_vect_path="../pickle_dir/summary/vocab_vectorizer_summary_pickle"
    pickle_x_mixed_path="../pickle_dir/summary/X_mixed_summary_pickle"

    feat_selector=SelectKBest(chi2,k=k)

    X_mixed=_generate_mixed_effect_matrix(X_path,y_path,feat_selector)

    pickle_obj(X_mixed,pickle_x_mixed_path)
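The helpers pickle_obj and _generate_mixed_effect_matrix are project utilities that are not shown in this snippet. Below is a minimal, hypothetical sketch of what they might look like, assuming the files are plain pickle dumps and the "mixed effect" matrix is simply the k-best selection fit on X and y; the real implementations may differ.

import pickle

def unpickle_obj(path):
    # hypothetical helper: load a previously pickled object from disk
    with open(path, "rb") as f:
        return pickle.load(f)

def pickle_obj(obj, path):
    # hypothetical helper: serialize an object to disk
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def _generate_mixed_effect_matrix(X_path, y_path, feat_selector):
    # assumption: load the feature matrix and labels, fit the selector,
    # and return the matrix reduced to the k best features
    X = unpickle_obj(X_path)
    y = unpickle_obj(y_path)
    return feat_selector.fit_transform(X, y)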
Example #2
import collections as coll
import os

import numpy as np
from sklearn.feature_extraction import DictVectorizer

def db_to_pickle(src_db,secondary_path=""):
    """
    Convert database data to pickled X, y, and ref_index.

    Each database object must have the following properties:
        ref_index: integer index assigned to the corresponding URLToGenre object
        attr_map: dictionary mapping each attribute to its count
        short_genres: the genres of the attr_map, i.e. its label(s)

    Results are stored in PICKLE_DIR/$secondary_path/
    :return: X,y,ref_index
        y is nx1 where n is the number of instances. It is an np array of lists, where each list holds
            all the genres an instance may have.
    """

    if secondary_path == "":
        print("No secondary path set.")

    vocabulary_set=set()
    for all_gram_obj in src_db.objects.no_cache():
        vocabulary_set |=set(all_gram_obj.attr_map.keys())
        del all_gram_obj

    print("The size of the url vocabulary: {}".format(len(vocabulary_set)))
    vocabulary_dict=coll.Counter(vocabulary_set)

    print("Fitting vocabulary")
    vectorizer=DictVectorizer()
    vectorizer.fit([vocabulary_dict])

    print("Transforming")
    stack=500
    X_stack=[]
    y_stack=[]
    ref_index=[]
    for c,all_gram_obj in enumerate(src_db.objects.no_cache()):
        if c%10000==0:
            print(c)

        X_stack.append(all_gram_obj.attr_map)
        y_stack.append(all_gram_obj.short_genres)
        ref_index.append(all_gram_obj.ref_index)
        del all_gram_obj

    X=vectorizer.transform(X_stack)
    y=np.array(y_stack)
    ref_index=np.array(ref_index)

    #store x,y, and ref_index into pickle
    dir_path=os.path.join(settings.PICKLE_DIR,secondary_path)

    os.makedirs(dir_path,exist_ok=True)

    X_path=os.path.join(dir_path,"X_{}_pickle".format(secondary_path))
    y_path=os.path.join(dir_path,"y_{}_pickle".format(secondary_path))
    ref_path=os.path.join(dir_path,"refIndex_{}_pickle".format(secondary_path))
    vectorizer_path=os.path.join(dir_path,"vocab_vectorizer_{}_pickle".format(secondary_path))

    pickle_obj(X,X_path)
    pickle_obj(y,y_path)
    pickle_obj(ref_index,ref_path)
    pickle_obj(vectorizer,vectorizer_path)

    return X,y,ref_index
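The core pattern above is DictVectorizer's fit/transform split: fitting once on a single Counter that covers the whole vocabulary fixes the column ordering, and each attr_map dict is then transformed into one row of a sparse count matrix. A small, self-contained illustration with made-up attribute names:

import collections as coll

from sklearn.feature_extraction import DictVectorizer

# toy stand-ins for all_gram_obj.attr_map
attr_maps = [
    {"news": 3, "sports": 1},
    {"sports": 2, "music": 5},
]

# build the vocabulary the same way db_to_pickle does
vocabulary_set = set()
for attr_map in attr_maps:
    vocabulary_set |= set(attr_map.keys())
vocabulary_dict = coll.Counter(vocabulary_set)

vectorizer = DictVectorizer()
vectorizer.fit([vocabulary_dict])      # fixes the column ordering over the full vocabulary
X = vectorizer.transform(attr_maps)    # sparse matrix, one row per attr_map

print(vectorizer.feature_names_)       # column names
print(X.toarray())                     # per-attribute counts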
Example #3
def unsupervised(settings,train_set,clusterer,clustering_alg_cls):
    clustering_logger.info("Unsupervised Algorithm training size: {}".format(train_set.X.shape))

    for num_cluster in sorted(settings.num_clusters,reverse=True):

        X,y,ref_ids=train_set.to_matrices()

        additional_notes=""

        if train_set.X.shape[0]<=settings.spectre_clustering_limit:
            #small data sets fall back to agglomerative clustering, which needs a dense matrix
            clustering_alg=AgglomerativeClustering(n_clusters=num_cluster)
            additional_notes="_agglomerative"
            X=X.toarray()
        else:
            clustering_alg=clustering_alg_cls(n_clusters=num_cluster)

        clustering_logger.info("Using {}".format(str(clustering_alg)+additional_notes))

        res_labels=clustering_alg.fit_predict(X)

        occurence_dict=clusterer.get_clusters_genre_distribution(y,res_labels)

        #the directory to store the results of clustering
        res_dir=os.path.join(UNSUPERVISED_DIR,settings.clustering_alg,*settings.parent_clusters)
        os.makedirs(res_dir,exist_ok=True)

        #eliminate clusters with fewer than 12 pages
        for cluster_name, cluster_genre_count in list(occurence_dict.items()):
            total_count_in_cluster=sum((count for genre,count in cluster_genre_count.items()))

            if total_count_in_cluster < 12:
                del occurence_dict[cluster_name]
            else:
                path=os.path.join(res_dir,"{}_{}_pages".format(num_cluster,cluster_name))
                #OUTPUT the pages in the current cluster
                clusterer.output_pages_in_cluster(path,train_set.ref_index[res_labels==cluster_name])

        res_file="{}/{}.pdf".format(res_dir,str(num_cluster))

        clusterer.generate_cluster_distribution_graphs(res_file,occurence_dict,res_labels)

        #output closeness metrics
        if additional_notes=="":
            inter_cluster,inter_cluster_count,intra_cluster,intra_cluster_count=Clustering().cluster_closeness(clustering_alg.cluster_centers_,X,res_labels)
            clusterer.output_cluster_closeness("{}/{}.txt".format(res_dir,num_cluster),inter_cluster,
                                               inter_cluster_count,intra_cluster,intra_cluster_count)

        #do a DFS on clusters bigger than the prescribed size
        if settings.break_up_clusters:
            breakup_candidate=[]

            for i in range(0,num_cluster):
                if np.sum(res_labels==i)>=settings.max_cluster_size:
                    breakup_candidate.append(i)

            X_path=os.path.join(res_dir,"X")
            y_path=os.path.join(res_dir,"y")
            ref_indexes_path=os.path.join(res_dir,"ref_indexes")

            clustering_logger.info("Pickling X,y,ref_index to conserve memory")
            pickle_obj(train_set.X,X_path)
            pickle_obj(train_set.y,y_path)
            pickle_obj(train_set.ref_index,ref_indexes_path)

            for cluster_name in breakup_candidate:
                clustering_logger.info("Breaking up cluster {} of size greater than {}".format(cluster_name,settings.max_cluster_size))

                settings.parent_clusters.append("{}_{}".format(num_cluster,cluster_name))

                selector=(res_labels==cluster_name)

                train_set.X=train_set.X[selector]
                train_set.y=train_set.y[selector]
                train_set.ref_index=train_set.ref_index[selector]

                unsupervised(settings,train_set,clusterer,clustering_alg_cls)

                settings.parent_clusters.pop()

                train_set.X=unpickle_obj(X_path)
                train_set.y=unpickle_obj(y_path)
                train_set.ref_index=unpickle_obj(ref_indexes_path)

            #remove the cache files
            os.remove(ref_indexes_path)
            os.remove(X_path)
            os.remove(y_path)
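The recursive break-up of oversized clusters relies on numpy boolean masking: res_labels==cluster_name yields a mask that slices the matching rows out of X, y, and ref_index before unsupervised() recurses on them. A small, self-contained illustration of that mechanism with toy data (not the project's):

import numpy as np
from scipy.sparse import csr_matrix

# toy stand-ins for train_set.X, train_set.y, train_set.ref_index and the
# cluster labels returned by fit_predict
X = csr_matrix(np.arange(12).reshape(6, 2))
y = np.array(["a", "b", "a", "c", "a", "b"])
ref_index = np.array([10, 11, 12, 13, 14, 15])
res_labels = np.array([0, 1, 0, 2, 0, 1])

cluster_name = 0
selector = (res_labels == cluster_name)   # boolean mask selecting one cluster

# the same slicing unsupervised() applies before recursing on a large cluster
X_sub = X[selector]
y_sub = y[selector]
ref_sub = ref_index[selector]

print(np.sum(selector))   # 3 pages fall in cluster 0
print(ref_sub)            # [10 12 14]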