Example #1
File: X_y.py Project: wangk1/research
def load_X_y_refIndex(path_X,path_y,path_refId):
    """
    Load X,y, and refIndex from their respective pickle directory.

    :param feature_type:
    :param pickle_dir:
    :return:
    """

    X=unpickle_obj(path_X)
    y=unpickle_obj(path_y)
    ref_id=unpickle_obj(path_refId)

    return X,y,ref_id
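A minimal usage sketch; the pickle paths below are hypothetical placeholders following the project's pickle_dir naming:

# Hypothetical paths; adjust to the actual pickle_dir layout.
X, y, ref_id = load_X_y_refIndex("pickle_dir/summary/X_summary_pickle",
                                 "pickle_dir/summary/y_summary_pickle",
                                 "pickle_dir/summary/refIndex_summary_pickle")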
Example #2
def _generate_mixed_effect_matrix(X_path,y_path,feat_selector):
    """
    Converts X to a COO Matrix of Mixed effect matrix

    :param X_path:
    :param y_path:
    :param feat_selector:
    :return:
    """

    mixed_effect_logger.debug("Flattening")

    #Reduce the column count
    X,y,_=flatten_set(*random_pick_samples(unpickle_obj(X_path),genre_normalizer(unpickle_obj(y_path))))
    feat_selector.fit(X,y)

    mixed_effect_logger.debug("Final size of X: {} y:{}".format(X.shape,y.shape))

    #Get the column selector, indices
    vocab_selector=feat_selector.get_support(True)
    num_vocab=vocab_selector.shape[0]

    vstack_list=[0]*X.shape[0]
    for ind,X_row in enumerate(X):
        if ind % 10 == 0:
            mixed_effect_logger.info("Done with {}".format(ind))

        row=np.zeros((1,num_vocab**2))
        select_col=X_row[0,vocab_selector].toarray() #convert to dense rep.

        #Compare each selected feature against every other; record the minimum of the
        #two counts as their co-occurrence
        for col_ind in range(0,select_col.shape[1]):
            if not select_col[0,col_ind]:
                continue

            cmp=np.full((1,select_col.shape[1]),fill_value=select_col[0,col_ind])
            #take the elementwise minimum without overwriting select_col, so later
            #iterations still compare against the original counts
            row[0,col_ind*num_vocab:(col_ind+1)*num_vocab]=np.minimum(select_col,cmp)

        vstack_list[ind]=lil_matrix(row)
        del row,select_col

    return vstack(vstack_list).tocoo()
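A hedged usage sketch: any selector exposing fit() and get_support(True) fits the feat_selector interface; scikit-learn's SelectKBest is assumed here, and the paths are placeholders.

from sklearn.feature_selection import SelectKBest, chi2

# k selected features yield a k*k-column co-occurrence matrix per the loop above.
selector = SelectKBest(chi2, k=1000)
cooccurrence = _generate_mixed_effect_matrix("pickle_dir/summary/X_summary_pickle",
                                             "pickle_dir/summary/y_summary_pickle",
                                             selector)
print(cooccurrence.shape)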
Example #3
def num_genre_per_webpage(matrix_path):
    """
    Create a box plot of how many other genres each webpage has for each genre

    Also, record the occurence of genres with each other
    :param matrix_path:
    :return:
    """

    label_matrix=unpickle_obj(matrix_path)

    genre_to_num_webpages=coll.defaultdict(list)

    for webpage_genre in label_matrix:

        normalized_genre=set([normalize_genre_string(g,1) for g in webpage_genre])

        for g in normalized_genre:
            if g in bad_genre_set:
                continue

            #if normalized_genre-{g}:
            genre_to_num_webpages[g].append(normalized_genre-{g})


    #box plot it
    genre_to_num_item_iter=genre_to_num_webpages.items()

    plt.clf()
    plt.figure(1)

    plt.xticks([i for i in range(0,len(genre_to_num_item_iter))],[op.itemgetter(0)(i) for i in genre_to_num_item_iter])
    plt.yticks(range(0,6))
    plt.tick_params(axis="both",which="major",labelsize=5)

    for c,(g,counts) in enumerate(genre_to_num_item_iter):
        add_bar_plot(c,[ len(gs) for gs in counts])

    plt.savefig("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\genre_analysis\\genre_dist.pdf")
    #print
    print(genre_to_num_webpages)
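A usage sketch; the path mirrors the commented-out call in Example #4 below:

# Uses the same y label-matrix pickle loaded in Example #4.
num_genre_per_webpage("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir\\y_summary_pickle")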
Example #4
        for (w,c) in sorted_list:
            file.write("{}, {}\n".format(w,c))




if __name__=="__main__":
    dmoz_alexa_similarity()
    exit(0)
    path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_chi_top1cls_10000"
    outpath="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_2000_chi2\\miss_plt"

    y_path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir\\y_summary_pickle"

    y=unpickle_obj(y_path)

    tabulate_genre_dist(y)

    #num_genre_per_webpage("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir\\y_summary_pickle")


    #dmoz_alexa_similarity()
    # #prob_dict=load_prob_dict()
    #for i in range(1,5):
        #consensus_count,consensus_total=consensus_class_per_genre(path,filter_func=lambda x:len(x)==i)
        #plot_consensus_percentile(consensus_count,consensus_total)
    #multi_class_misprediction_freq(path)

    #plot_miss_per_genre(path,outpath,classifiers="LogisticRegression")
Example #5
__author__ = 'Kevin'
import itertools

from analytics.classification_results.res_iterator import RightResultsIter,WrongResultsIter
from misc_scripts.assign_ref_index import global_ref_id
from misc_scripts.remove_summary_duplicates import remove_summary_duplicates_in_urlbow
from data.util import unpickle_obj
from util.base_util import normalize_genre_string

if __name__=="__main__":
    y=unpickle_obj("C:/Users/wangk1/Desktop/Research/research/pickle_dir/summary/y_summary_pickle")

    num=0
    for y_i in y:
        if len(set(normalize_genre_string(n) for n in y_i))==2:
            num+=1
    print(num)

    #remove_summary_duplicates_in_urlbow()

    #assign_ref_index_to_each_url()

    """
    path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_chi_top4cls_10000"
    #path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_100_chi_truncated_lsa"
    classifier="LogisticRegression"
    num_top=1

    correct=0
    wrong=0
Example #6
def unsupervised(settings,train_set,clusterer,clustering_alg_cls):
    clustering_logger.info("Unsupervised Algorithm training size: {}".format(train_set.X.shape))

    for num_cluster in sorted(settings.num_clusters,reverse=True):

        X,y,ref_ids=train_set.to_matrices()

        additional_notes=""

        if train_set.X.shape[0]<=settings.spectre_clustering_limit:
            clustering_alg=AgglomerativeClustering(n_clusters=num_cluster)
            additional_notes="_agglomerative"
            X=X.toarray()
        else:
            clustering_alg=clustering_alg_cls(n_clusters=num_cluster)

        clustering_logger.info("Using {}".format(str(clustering_alg)+additional_notes))

        res_labels=clustering_alg.fit_predict(X)

        occurence_dict=clusterer.get_clusters_genre_distribution(y,res_labels)

        #the directory to store the results of clustering
        res_dir=os.path.join(UNSUPERVISED_DIR,settings.clustering_alg,*settings.parent_clusters)
        os.makedirs(res_dir,exist_ok=True)

        #Eliminate clusters with fewer than 12 pages
        for cluster_name, cluster_genre_count in list(occurence_dict.items()):
            total_count_in_cluster=sum((count for genre,count in cluster_genre_count.items()))

            if total_count_in_cluster < 12:
                del occurence_dict[cluster_name]
            else:
                path=os.path.join(res_dir,"{}_{}_pages".format(num_cluster,cluster_name))
                #OUTPUT the pages in the current cluster
                clusterer.output_pages_in_cluster(path,train_set.ref_index[res_labels==cluster_name])


        res_file="{}/{}.pdf".format(res_dir,str(num_cluster))



        clusterer.generate_cluster_distribution_graphs(res_file,occurence_dict,res_labels)

        #output closeness metrics
        if additional_notes=="":
            inter_cluster,inter_cluster_count,intra_cluster,intra_cluster_count=Clustering().cluster_closeness(clustering_alg.cluster_centers_,X,res_labels)
            clusterer.output_cluster_closeness("{}/{}.txt".format(res_dir,num_cluster),inter_cluster,
                                               inter_cluster_count,intra_cluster,intra_cluster_count)

        #Recursively (depth-first) break up clusters bigger than the prescribed size
        if settings.break_up_clusters:
            breakup_candidate=[]

            for i in range(0,num_cluster):
                if np.sum(res_labels==i)>=settings.max_cluster_size:
                    breakup_candidate.append(i)

            X_path=os.path.join(res_dir,"X")
            y_path=os.path.join(res_dir,"y")
            ref_indexes_path=os.path.join(res_dir,"ref_indexes")

            clustering_logger.info("Pickling X,y,ref_index to conserve memory")
            pickle_obj(train_set.X,X_path)
            pickle_obj(train_set.y,y_path)
            pickle_obj(train_set.ref_index,ref_indexes_path)

            for cluster_name in breakup_candidate:
                clustering_logger.info("Breaking up cluster {} of size greater than {}".format(cluster_name,settings.max_cluster_size))

                settings.parent_clusters.append("{}_{}".format(num_cluster,cluster_name))

                selector=(res_labels==cluster_name)

                train_set.X=train_set.X[selector]
                train_set.y=train_set.y[selector]
                train_set.ref_index=train_set.ref_index[selector]

                unsupervised(settings,train_set,clusterer,clustering_alg_cls)

                settings.parent_clusters.pop()

                train_set.X=unpickle_obj(X_path)
                train_set.y=unpickle_obj(y_path)
                train_set.ref_index=unpickle_obj(ref_indexes_path)

            #remove the cache files
            os.remove(ref_indexes_path)
            os.remove(X_path)
            os.remove(y_path)
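A rough invocation sketch, not the project's actual driver: the settings object below is a hypothetical stand-in carrying only the attributes unsupervised() reads, train_set and clusterer are assumed to come from the project's clustering pipeline, and scikit-learn's KMeans stands in for clustering_alg_cls since the function only passes n_clusters.

from types import SimpleNamespace
from sklearn.cluster import KMeans

# Hypothetical settings; the attribute names are exactly those read by unsupervised().
settings = SimpleNamespace(num_clusters=[20, 10],          # cluster counts to try, largest first
                           spectre_clustering_limit=5000,   # at or below this many rows, AgglomerativeClustering is used instead
                           clustering_alg="kmeans",         # sub-directory name under UNSUPERVISED_DIR
                           parent_clusters=[],              # parent-cluster path used when recursing
                           break_up_clusters=True,
                           max_cluster_size=2000)

# train_set (with X, y, ref_index, to_matrices()) and clusterer are project objects, not shown here.
unsupervised(settings, train_set, clusterer, KMeans)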
Example #7
File: X_y.py Project: wangk1/research
def load_X_y(path_X,path_y):
    return unpickle_obj(path_X), unpickle_obj(path_y)
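A one-line usage sketch with placeholder paths:

X, y = load_X_y("pickle_dir/summary/X_summary_pickle", "pickle_dir/summary/y_summary_pickle")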
Example #8
    #CLASSIFICATION, adjust weights
    classifier_util=ClassifierUtil()

    """
    LOAD DATA, preprocess
    """

    #WARNING: REF INDEX for each individual X set must match row to row
    Xs=[]
    ys=[]
    ref_indexes_unmatched=[]
    ref_indexes=[]

    for setting in settings:
        supervised_logger.info("Loading data for {}".format(setting))
        X=unpickle_obj("pickle_dir\\{}\\X_{}_pickle".format(setting.feature_selection,setting.feature_selection))
        ref_index=unpickle_obj("pickle_dir\\{}\\refIndex_{}_pickle".format(*itertools.repeat(setting.feature_selection,2)))
        y=unpickle_obj("pickle_dir\\{}\\y_{}_pickle".format(*itertools.repeat(setting.feature_selection,2)))
        y=np.array([list(set((normalize_genre_string(g,1) for g in g_list))) for g_list in y])

        #filter out unwanted genres
        X_filtered,y_filtered,ref_index_filtered=filter_genres(X,y,ref_index,ignore_genre)
        ref_indexes_unmatched.append(ref_index_filtered)
        Xs.append(X_filtered)
        ys.append(y_filtered)

    #match refids
    supervised_logger.info("Making ref indexes match for the data sets")
    Xs,ys,ref_indexes=match_sets_based_on_ref_id(Xs,ys,ref_indexes_unmatched)

    #make sure ref indexes match