Example #1
def URLToGenre_to_bow():
    bow=word_based.BagOfWords()

    all_ref_index=set(i.ref_index for i in URLBow.objects.no_cache())

    URLBow_fulltxt.objects.delete()
    for c,db_obj in enumerate(URLToGenre.objects.no_cache()):
        if c%1000==0:
            print("Done with {}".format(c))

        if not hasattr(db_obj,"original") or not db_obj.original \
            or not hasattr(db_obj,"page") or not db_obj.page:
            continue

        ref_id=db_obj.ref_index
        if ref_id not in all_ref_index:
            continue

        page=db_obj.page

        if isinstance(page,list):
            print("{} is a list updating".format(ref_id))
            page="".join(page)
            #db_obj.update(page=page)

        try:
            word_dict=bow.get_word_count(page)
        except Exception:
            print("Skipped {}".format(ref_id))
            continue
        short_genres=[normalize_genre_string(genre.genre) for genre in db_obj.genre]

        URLBow_fulltxt(ref_index=ref_id,attr_map=word_dict,short_genres=short_genres,\
                       url=db_obj.url).save()
Example #2
def full_page_bow():
    """
    Creates a bag of words from the full text of each page in URLToGenre that is an original page.

    :return:
    """
    queue=DBQueue(Queue_full_page,"full_page_bow_queue")
    bow_model=BagOfWords()


    for number in range(queue.get_location(),Queue_full_page.objects.count()):
        queue_obj=Queue_full_page.objects.get(number=number)

        url_obj=URLToGenre.objects.get(url=queue_obj.url)

        if number % 1000==0:
            print(number)

        try:

            bow=bow_model.get_word_count(url_obj.page)

            if url_obj.page.strip()=="":
                raise Exception("Bad Page")
        except Exception as ex:
            with open("bad_full_url.txt",mode="a") as out:
                out.write("{}:::{}\n".format(number,str(ex)))
                queue.increment_location()
                continue

        URLBow_fulltxt(bow=bow,bow_index=queue_obj.number,short_genres=[normalize_genre_string(genre.genre,2)
                                                                            for genre in url_obj.genre]).save()
        queue.increment_location()
Example #3
    def actual(self):
        """
        Get all the genres of the instance, normalized to the level given by self.genre_lv.

        :return:
        """
        return [normalize_genre_string(g,self.genre_lv) for g in self.__actual]
Example #4
def start_bow():
    """
    Bag-of-words all webpages in the URLToGenre database.

    Pipeline:
    1. Get genre and page from URLToGenre Object from the mongodb URLQueue
    2. BOW the webpage in URLToGenre Object
    3. Shorten the Genre
    4. Insert the words in bow into the genre in CategoricalBow Mongodb table

    Repeat until exhaustion of URLToGenre Objects

    :return: Nothing!
    """

    queue=DBQueue(None,"summarization")

    #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()

    for url_obj in URLToGenre.objects.order_by("ref_index").no_cache():

        try:
            print('New url {}'.format(url_obj.ref_index))

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                continue

            #skip conditions: it does not have a page or it is not an original url
            if not hasattr(url_obj,'page'):
                raise Exception('url {} has no page'.format(url_obj.ref_index))

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)

            genre_string_list=list(set(genre_string_list))

            summarize_logger.info("Getting bow rep")
            #get BOW representation
            bow_dict=bow.get_word_count(summarizer.summarize(url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)))

            summarize_logger.info("Update count:"+str(bow_dict))

            if len(bow_dict)==0:
                raise Exception("No count available")

            #store the url bow in urlbow table
            if len(Summary.objects(ref_index=url_obj.ref_index))==0:
                Summary(url=url_obj.url,ref_index=url_obj.ref_index,attr_map=bow_dict,short_genres=genre_string_list).save()
            else:
                print('Exists bow url number {}'.format(url_obj.ref_index))

        except Exception as ex:
            summarize_logger.error(url_obj['url']+":::"+str(ex),"C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow.txt")
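
All of the examples on this page lean on normalize_genre_string to shorten a hierarchical genre path to its top level(s). The project's real implementation lives in util.base_util and is not shown here; a minimal sketch of the assumed behavior (the slash separator and edge-case handling are assumptions) is:

def normalize_genre_string(genre, level=1):
    # Assumed behavior: truncate a slash-separated genre path such as
    # "Arts/Music/Jazz" to its first `level` components ("Arts" for level=1).
    # The actual separator and normalization rules are defined in util.base_util.
    return "/".join(genre.strip().strip("/").split("/")[:level])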
Example #5
def map_urlFullText(genre_dict):


    generate_training_testing((ClassificationSource(url_fulltxt_obj.bow_index,url_fulltxt_obj.bow,
                                                    util.normalize_genre_string(url_fulltxt_obj.short_genres[0],1))
                                    for url_fulltxt_obj in URLBow_fulltxt.objects)
                              ,test_set_nums,genre_dict.keys()
                              ,train_coll_cls=TrainSet_urlFullTextBow,test_coll_cls=TestSet_urlFullTextBow)
Example #6
def map_urlAllGram(genre_dict):

    mapped_obj=(ClassificationSource(url_allgram_obj.ngram_index,url_allgram_obj.ngram,
                                                    util.normalize_genre_string(url_allgram_obj.short_genres[0],1))
                                    for url_allgram_obj in URLAllGram.objects)

    generate_training_testing( mapped_obj
                              ,test_set_nums,genre_dict.keys()
                              ,train_coll_cls=TrainSet_urlAllGram,test_coll_cls=TestSet_urlAllGram)
Example #7
    def fit(self,X,y):
        feature_logger.info("Fitting transformers for each class")
        #Get all the classes first
        genre_set=set((normalize_genre_string(g,1) for g in y))

        #stage 1
        transformer_list=[] #list of all the transformers for each class/genre
        for g in genre_set:
            feature_logger.info("Fitting transformer for {}".format(g))
            transformer_obj=copy.deepcopy(self.transformers[0])

            genre_matches=[g == normalize_genre_string(g_1,1) for g_1 in y]

            #X_match=X[np.array(genre_matches)]
            #y_match=y[np.array(genre_matches)]

            transformer_obj.fit(X,genre_matches)
            transformer_list.append((g,transformer_obj))

        #now train the actual transformer
        self.transformer=FeatureUnion(transformer_list,n_jobs=1)
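
After fitting, the per-genre transformers are applied through the combined FeatureUnion. A hedged usage sketch (the transform method below is hypothetical and not part of the original class):

    def transform(self, X):
        # Hypothetical companion method: apply every per-genre transformer
        # fitted above and concatenate their selected features column-wise.
        return self.transformer.transform(X)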
Example #8
File: util.py Project: wangk1/research
def genre_normalizer(y,level=1,dim=2):
    """
    Utility function for normalizing a vector of genre lists or a vector of genre strings to level @param level.

    Note that this function does not operate in place; a new object is created.
    :param y:
    :param level:
    :return:
    """
    if hasattr(y,"dtype") and np.issubdtype(y.dtype,np.str):
        new_y=np.array([normalize_genre_string(i,level) for i in y])

    else:
        if dim==2:
            no_rep_list=[list(set((normalize_genre_string(g,level) for g in y_list)))  for y_list in y]
        else:
            no_rep_list=list(set([normalize_genre_string(g,level)  for g in y]))

        new_y=np.array(no_rep_list)

    return new_y
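
A small usage sketch of genre_normalizer (the genre strings are made up and assume the slash-separated paths sketched earlier):

import numpy as np

# 2-D case: each row is one webpage's list of genres
y_lists = [["Arts/Music", "Arts/Film"], ["Sports/Soccer"]]
print(genre_normalizer(y_lists, level=1, dim=2))   # e.g. [['Arts'] ['Sports']]

# 1-D case: a flat numpy array of genre strings hits the dtype branch
y_flat = np.array(["Arts/Music", "Sports/Soccer"])
print(genre_normalizer(y_flat, level=1))           # e.g. ['Arts' 'Sports']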
Example #9
def calculate_adjusted_miss_rate(res_folder):
    """
    Given the path to folder containing classifier results files that have the form of ClassifierName_wrong.txt or *_right.txt.

    We adjust for those wrong examples that have 1+ class and

    :param path_of_classifier_res:
    :return:
    """
    """
    Script that reads how many wrong we have
    """

    #counts instances classified as wrong whose prediction actually falls
    #into one of their many genres
    swing_counter=collections.Counter()

    #get all result files that end with _right or _wrong
    for a_result in filter(lambda x: os.path.isfile(os.path.join(res_folder,x)),os.listdir(res_folder)):
        assert isinstance(a_result,str)
        abs_result=os.path.join(res_folder,a_result)
        print(a_result)

        right=0
        wrong=0
        if a_result.find("right") > -1:
            #count the rights
            with open(abs_result) as file:
                right+=sum((1 for i in file if i.strip() != ""))

        elif a_result.find("wrong")>-1:
            wrong_res_objs=get_classification_res(abs_result)

            #grab all the genres and see if it exists
            for c,res_obj in enumerate(wrong_res_objs):
                found=False

                #grab all short genres and see if it matches
                url_bow_obj=URLBow.objects(index=res_obj.ref_id).only("short_genres")[0]

                found=res_obj.predicted in (normalize_genre_string(g,1) for g in url_bow_obj.short_genres) or found

                if found:
                    swing_counter.update([res_obj.ref_id])
                    right+=1

                else:
                    wrong+=1

        print("Total right: {}, total wrong: {}".format(right,wrong))

    print("Swing counter {}".format(str(swing_counter)))
    print("Swing counter size : {}".format(len(swing_counter)))
Example #10
    def is_swing_sample(self,top_x_predicted=1):
        """
        Test if the ClassificationResultInstance object is a swing instance, its predicted class is within one of its
        multiple classes. So, right predictions are automatically also swing instances. But, wrong predicted samples
        may be a swing instance

        :param: top_x_predicted: check if the top x predictions are in the class's genres. If they all are,
        :return: True or False if the sample is swing instance
        """

        #grab all short genres and see if it matches
        url_bow_obj=URLBow.objects(index=self.ref_id).only("short_genres")[0]

        return all(pred_g in (normalize_genre_string(g,self.genre_lv) for g in url_bow_obj.short_genres) for pred_g in self.predicted[:top_x_predicted])
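
A toy illustration of the check above (hypothetical genres, assuming the level-1 normalization sketched earlier):

short_genres = ["Arts/Music", "Sports/Soccer"]     # the instance's true genres
predicted = ["Sports"]                             # top-1 prediction, already at level 1
normalized = {normalize_genre_string(g, 1) for g in short_genres}   # {"Arts", "Sports"}
print(all(p in normalized for p in predicted))     # True -> swing instance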
Example #11
def get_genre_similarities():

    #thresholds above which two genres are considered similar and below which they are considered different
    THRESHOLD_SIMILAR=0.5
    THRESHOLD_DIFF=0.2
    '''
    sim_sam_cat->very similar genres derived from the same parent
    sim_diff_cat->very similar genres derived from different parents
    diff_sim_cat->very different genres derived from the same parent
    '''

    with open("genre_similarity.txt",encoding="latin-1") as sim_txt \
        ,open("genre_similarity_similar_cat.txt",encoding="latin-1",errors="ignore",mode="a") as sim_sam_cat \
        ,open("genre_similarity_similar_diff_cat.txt",encoding="latin-1",errors="ignore",mode="a") as sim_diff_cat \
        ,open("genre_similarity_diff_similar_cat.txt",encoding="latin-1",errors="ignore",mode="a") as diff_sim_cat:

        #line format:genre1, genre2 value: num\n
        for line in sim_txt:
            split_line=line.split(" ")

            genre1=normalize_genre_string(split_line[0][:-1])
            #[:-1] above strips the trailing comma after the first genre
            genre2=normalize_genre_string(split_line[1])

            sim_score=float(split_line[3][:-1])

            #If the two genres are similar or different
            if sim_score>THRESHOLD_SIMILAR:
                if has_same_parent(genre1,genre2):
                    sim_sam_cat.write("{}".format(line))
                else:
                    sim_diff_cat.write("{}".format(line))

            elif sim_score<THRESHOLD_DIFF:
                if has_same_parent(genre1,genre2):
                    diff_sim_cat.write("{}".format(line))
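
A worked example of the parsing above (the line contents are hypothetical):

line = "Arts/Music, Sports/Soccer value: 0.12\n"
split_line = line.split(" ")
# split_line == ["Arts/Music,", "Sports/Soccer", "value:", "0.12\n"]
genre1 = split_line[0][:-1]              # "Arts/Music", trailing comma stripped
genre2 = split_line[1]                   # "Sports/Soccer"
sim_score = float(split_line[3][:-1])    # 0.12, trailing newline stripped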
Example #12
def num_genre_per_webpage(matrix_path):
    """
    Create a box plot of how many other genres each webpage has, for each genre.

    Also, record the co-occurrence of genres with each other.
    :param matrix_path:
    :return:
    """

    label_matrix=unpickle_obj(matrix_path)

    genre_to_num_webpages=coll.defaultdict(list)

    for webpage_genre in label_matrix:

        normalized_genre=set([normalize_genre_string(g,1) for g in webpage_genre])

        for g in normalized_genre:
            if g in bad_genre_set:
                continue

            #if normalized_genre-{g}:
            genre_to_num_webpages[g].append(normalized_genre-{g})


    #box plot it
    genre_to_num_item_iter=genre_to_num_webpages.items()

    plt.clf()
    plt.figure(1)

    plt.xticks([i for i in range(0,len(genre_to_num_item_iter))],[op.itemgetter(0)(i) for i in genre_to_num_item_iter])
    plt.yticks(range(0,6))
    plt.tick_params(axis="both",which="major",labelsize=5)

    for c,(g,counts) in enumerate(genre_to_num_item_iter):
        add_bar_plot(c,[ len(gs) for gs in counts])

    plt.savefig("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\genre_analysis\\genre_dist.pdf")
    #print
    print(genre_to_num_webpages)
Example #13
def create_url_ngram():
    """
    Create an ngram database of all the urls in the URLToGenre database that have the original flag set to true.

    :return:
    """
    #clearing db
    URLAllGram.objects().delete()

    url_model=URLTransformer()

    for c,url_bow_obj in enumerate(URLBow.objects.no_cache()):
        if c%1000==0:
            print("Done with {}".format(c))

        ref_index=url_bow_obj.ref_index
        url=url_bow_obj.url

        ngram=url_model.transform(url)

        URLAllGram(attr_map=ngram,ref_index=ref_index,short_genres=list(set([normalize_genre_string(genre,1)
                                                                            for genre in url_bow_obj.short_genres]))).save()
Example #14
def create_mrf(graph_cut_data):
    """
    Create the mrf without any edges but containing all the vertices.

    The vertices are the individual website vertices and the label vertices.

    Also, initialize the coo_matrix representation of genre word distribution

    :param graph_cut_data:
    :return: mrf, the graph of all vertices; the label vertices are also recorded in graph_cut_data.label_vector_list
    """

    assert isinstance(graph_cut_data,GraphCutParams)
    mrf=igraph.Graph(directed=True)

    label_vertices=[]

    if not isinstance(graph_cut_data.y[0],str):
        actual_labels=[[normalize_genre_string(g,1) for g in g_list] for g_list in graph_cut_data.y]
    else:
        actual_labels=[i for i in graph_cut_data.y]

    #create the label to vocab matrix
    graph_cut_data.genre_word_count=sp.coo_matrix((graph_cut_data.num_cluster,graph_cut_data.vocab_size),dtype=np.dtype(float))

    #create the label vertex
    for index,l in enumerate(graph_cut_data.cluster_names):
        mrf.add_vertex(l,pred_label=l,actual_label=l,is_label=True,index=-1)
        label_vertices.append(l)
        graph_cut_data.genre_to_index[l]=index

    graph_cut_data.label_vector_list=[mrf.vs.find(name=label_vertex) for label_vertex in label_vertices]

    #create the website vertices
    for index,ref_id in enumerate(graph_cut_data.ref_id):
        mrf.add_vertex(ref_id,is_label=False,index=index,actual_label=actual_labels[index])

    print("Actual Labels length: {}".format(len(actual_labels)))

    return mrf
Example #15
    """
    LOAD DATA, preprocess
    """

    #WARNING: REF INDEX for each individual X set must match row to row
    Xs=[]
    ys=[]
    ref_indexes_unmatched=[]
    ref_indexes=[]

    for setting in settings:
        supervised_logger.info("Loading data for {}".format(setting))
        X=unpickle_obj("pickle_dir\\{}\\X_{}_pickle".format(setting.feature_selection,setting.feature_selection))
        ref_index=unpickle_obj("pickle_dir\\{}\\refIndex_{}_pickle".format(*itertools.repeat(setting.feature_selection,2)))
        y=unpickle_obj("pickle_dir\\{}\\y_{}_pickle".format(*itertools.repeat(setting.feature_selection,2)))
        y=np.array([list(set((normalize_genre_string(g,1) for g in g_list))) for g_list in y])

        #filter out unwanted genres
        X_filtered,y_filtered,ref_index_filtered=filter_genres(X,y,ref_index,ignore_genre)
        ref_indexes_unmatched.append(ref_index_filtered)
        Xs.append(X_filtered)
        ys.append(y_filtered)

    #match refids
    supervised_logger.info("Making ref indexes match for the data sets")
    Xs,ys,ref_indexes=match_sets_based_on_ref_id(Xs,ys,ref_indexes_unmatched)

    #make sure ref indexes match
    match=True
    prev_index=ref_indexes_unmatched[0]
    for ref_index in ref_indexes_unmatched[1:]:
Example #16
def get_full_text_genres():
    genres_iter=(set(util.normalize_genre_string(g,1) for g in url_bow_obj.short_genres) for url_bow_obj in URLBow_fulltxt.objects)

    return itertools.chain(*genres_iter)
Example #17
def get_all_gram_genres():
    genres_iter=(set(util.normalize_genre_string(g,1) for g in allgram_obj.short_genres) for allgram_obj in URLAllGram.objects)

    return itertools.chain(*genres_iter)
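
Either of these genre iterators can be fed straight into a Counter to get level-1 genre frequencies; a short usage sketch:

import collections

genre_counts = collections.Counter(get_all_gram_genres())
print(genre_counts.most_common(10))   # the ten most frequent level-1 genres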
Example #18
__author__ = 'Kevin'
import itertools

from analytics.classification_results.res_iterator import RightResultsIter,WrongResultsIter
from misc_scripts.assign_ref_index import global_ref_id
from misc_scripts.remove_summary_duplicates import remove_summary_duplicates_in_urlbow
from data.util import unpickle_obj
from util.base_util import normalize_genre_string

if __name__=="__main__":
    y=unpickle_obj("C:/Users/wangk1/Desktop/Research/research/pickle_dir/summary/y_summary_pickle")

    num=0
    for y_i in y:
        if len(set(normalize_genre_string(n) for n in y_i))==2:
            num+=1
    print(num)

    #remove_summary_duplicates_in_urlbow()

    #assign_ref_index_to_each_url()

    """
    path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_chi_top4cls_10000"
    #path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_100_chi_truncated_lsa"
    classifier="LogisticRegression"
    num_top=1

    correct=0
    wrong=0
Example #19
    clustering_alg=KMeans
    settings.num_clusters=list({16})
    settings.num_top_words=20 #LDA only
    settings.max_cluster_size=10000 #the cluster will be further broken up if it is greater than this size
    settings.break_up_clusters=True
    settings.spectre_clustering_limit=15000 # if the cluster is less than 15K in size, use spectral clustering instead

    #LOAD DATA
    #generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),50000)

    train_set=Training(settings,pickle_dir=PICKLE_DIR)
    train_set.load_training()

    #FEATURE SELECTION
    best_k_attr=10000
    feature_selector=Pipeline([("chi2",SelectKBest(chi2,best_k_attr))])

    clustering_logger.info("Choosing best {} features".format(best_k_attr))

    clustering_logger.debug("Normalizing to LV1")
    #NORMALIZING THE Y
    train_set.y=np.array([[normalize_genre_string(g,1) for g in r] for r in (row for row in train_set.y)])

    clusterer=Clustering()
    clusterer.feature_selection(train_set,feature_selector,fit=True)

    lda_alg=LDA(n_topics=settings.num_clusters[0],n_iter=500, random_state=1)

    lda(lda_alg,train_set,settings.num_top_words)
    #unsupervised(train_set=train_set, settings=settings,clusterer=clusterer, clustering_alg_cls=clustering_alg)
Example #20
from util.base_util import normalize_genre_string


pickle_dir="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir"

if __name__=="__main__":
    mapping={"short_genres":"short_genre","index":"ref_index","bow":"attr_map"}

    #s=SourceMapper(URLBow.objects(),mapping)
    X_pickle_path=os.path.join(pickle_dir,"X_summary_pickle")
    y_pickle_path=os.path.join(pickle_dir,"y_summary_pickle")
    ref_index_pickle_path=os.path.join(pickle_dir,"refIndex_summary_pickle")

    mapping={"short_genres":"short_genre","index":"ref_index","bow":"attr_map"}


    label="summary_unsupervised_chi_top1cls_10000"

    #generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),1000)

    #load training, feature selection
    train_set=Training(label,pickle_dir=pickle_dir)
    train_set.load_training()
    train_set.y=np.array([list(set(normalize_genre_string(genre,1) for genre in g_list)) for g_list in train_set.y])
    train_set.X=chi_squared_feature_select(train_set.X,train_set.y,k_best=10000)

    params=GraphCutParams(X=train_set.X,y=train_set.y,ref_id=train_set._ref_index,
                   k_closest_neighbors=4,vocab_size=train_set._X.shape[1],num_clusters=3)

    alpha_beta_swap(params)
Example #21
def collect_bad_url():
    """
    Make bag-of-words representations of the websites in the bad url list.

    :return:
    """

    queue=DBQueue_old("genre_bow")

    #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()
    short_genre_to_genre=coll.ShortGenre()
    url_to_bow=coll.URLBow()
    start_pos=queue.get()

    for c,line in enumerate(open("bad_url_summarize_bow.txt")):
        if c<start_pos:
            continue

        url=line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url,c))

            url_obj=coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                print("Not original")
                continue

            #request the page anyway; most of the bad urls are due to bad pages
            data=Request().get_data(base_util.unreplace_dot_url(base_util.unreplace_dot_url(url_obj["url"])))

            if data is None:
                raise Exception('url {} has no page'.format(url))
            else:
                if not hasattr(url_obj,"page") or len(data)>len(url_obj["page"]):
                    print("updating data")
                    data=base_util.utf_8_safe_decode(data)

                    if not hasattr(url_obj,"page"):
                        #no page stored yet, so save the new page for the first time
                        url_obj.save(page=data)

                    else:
                        url_obj.update(page=data)
                    url_obj.reload()

            if len(data) > len(url_obj.page):
                raise Exception("Inconsistency b/w data and page data")



            #url_obj=repair.genre_to_genre_data(url_obj.document)

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True,add_to_set__genres=g)

            Logger.info("Getting bow rep")
            #get BOW representation
            bow_dict=bow.get_word_count(summarizer.summarize(url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)))

            if len(bow_dict)<20:
                raise Exception("Words less than 20")

            Logger.info("Update count:"+str(bow_dict))


            #store the url bow in urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"],bow=bow_dict,short_genres=genre_string_list)

            else:
                print('Exists bow url number {}'.format(url))

            queue.increment()
        except Exception as ex:
            Logger.error(url_obj['url']+":::"+str(ex),"C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")