Example #1
def tokenize_photo(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Photo, "owner_id")

    tokens_lst = defaultdict(dict)
    num_docs  = docs_all.count()
    for photo in docs_all:
        tokens = queryProcess.processLine(photo.photo_name)
        for token in tokens:
            tokens_lst[token][photo.photo_id] = tokens_lst[token].get(photo.photo_id, 0) + 1
    return tokens_lst, num_docs
Example #2
def tokenize_link(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Link, "owner_id")

    tokens_lst = defaultdict(dict)
    num_docs  = docs_all.count()
    for link in docs_all:
        tokens = queryProcess.processLine(link.link_name + ' ' + link.link_description + ' ' + link.link_message)
        for token in tokens:
            tokens_lst[token][link.link_id] = tokens_lst[token].get(link.link_id, 0) + 1
    return tokens_lst, num_docs
Example #3
def tokenize_comment(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Comment, "owner_id")

    tokens_lst = defaultdict(dict)
    num_docs  = docs_all.count()
    for comment in docs_all:
        tokens = queryProcess.processLine(comment.comment_message)
        for token in tokens:
            tokens_lst[token][comment.comment_id] = tokens_lst[token].get(comment.comment_id, 0) + 1
    return tokens_lst, num_docs
Example #4
def tokenize_status(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Status, "owner_id")

    tokens_lst = defaultdict(dict)
    num_docs  = docs_all.count()
    for status in docs_all:
        tokens = queryProcess.processLine(status.status_message)
        for token in tokens:
            tokens_lst[token][status.status_id] = tokens_lst[token].get(status.status_id, 0) + 1

    return tokens_lst, num_docs
Example #5
def tokenize_post(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Post, "owner_id")

    tokens_lst = defaultdict(dict)
    num_docs  = docs_all.count()
    for post in docs_all:
        #print "id", post.post_id
        #print "caption:", post.post_caption
        #print "description:", post.post_description
        #print "message:", post.post_message
        #print "story:", post.post_story
        #print "name:", post.post_name
        #print "link:", post.post_link
        tokens = queryProcess.processLine(
            post.post_caption + ' '+  post.post_description + ' '+ post.post_message + ' '+ 
            post.post_story + ' '+ post.post_name)
        for token in tokens:
            tokens_lst[token][post.post_id] = tokens_lst.get(token, {}).get(post.post_id, 0) + 1

    return tokens_lst, num_docs
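The five tokenize_* helpers above differ only in the model they query and the text fields they concatenate; each returns the same inverted-index structure, a mapping from token to {document id: term frequency}, plus the document count. Below is a minimal sketch of that structure, using a hypothetical build_token_index stand-in and a plain lowercase-and-split tokenizer in place of queryProcess.processLine, which is not shown on this page.

from collections import defaultdict

def build_token_index(docs):
    # docs: list of (doc_id, text) pairs -- a stand-in for the Photo/Link/Comment/Status/Post querysets
    tokens_lst = defaultdict(dict)
    for doc_id, text in docs:
        for token in text.lower().split():  # stand-in for queryProcess.processLine
            tokens_lst[token][doc_id] = tokens_lst[token].get(doc_id, 0) + 1
    return tokens_lst, len(docs)

index, num_docs = build_token_index([(1, "beach sunset"), (4, "sunset sunset photo")])
# index == {'beach': {1: 1}, 'sunset': {1: 1, 4: 2}, 'photo': {4: 1}}
# num_docs == 2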
Example #6
def apply_search(owner_id, selected_friends, query, c_type):

    # tokens_lst maps each token to the documents it occurs in and its
    # term frequency in each of them: {token: {document number: frequency}}.
    # For example, if token 'A' occurs once in documents 1 and 9 and twice in
    # document 4, its entry in tokens_lst is 'A': {1: 1, 4: 2, 9: 1}

    tokens_lst, num_docs = get_tokens(owner_id, selected_friends, c_type)

    # remove stopwords and apply stemming
    tokens_lst = queryProcess.stemmer(tokens_lst, stopwords)

    # ===== Applying weighting scheme ===== #

    # doc_freq_lst maps token to its document frequency
    doc_freq_lst = dict()
    for token, doc_list in tokens_lst.items(): 
        doc_freq_lst[token] = len(doc_list)

    weight_index = index.calcWeight(tokens_lst, num_docs)
    doc_length = index.calcDocLen(weight_index)


    # =========== Process the Query ============ #
    # The query is treated as a single pseudo-document with id query_doc_no
    query_doc_no = 1

    query_tokens_lst = defaultdict(dict)
    # tokenize the query
    tokens = queryProcess.processLine(query)

    for token in tokens:
        query_tokens_lst[token][query_doc_no] = query_tokens_lst[token].get(query_doc_no, 0) + 1
    # stem the query tokens and remove stopwords
    query_tokens_lst = queryProcess.stemmer(query_tokens_lst, stopwords)

    # ========== Extract Query Set ============ #
    query = query.lower()

    # doc_set maps each document to its similarity score with the query
    doc_set = dict()

    # doc_set is restricted to documents that contain at least one query term

    for term in query_tokens_lst:
        for doc_no in tokens_lst[term]:
            # initialize the similarity score to 0
            if doc_no not in doc_set:
                doc_set[doc_no] = 0

    # calculate term weight and query length
    query_weight_index = index.calcQueryWeight(doc_freq_lst, query_tokens_lst, num_docs)
    query_length = index.calcDocLen(query_weight_index)
            
    # ===== Calculate the cosine similarity ===== #

    # Accumulate the inner product of the query and document weight vectors;
    # query terms that do not occur in any indexed document contribute nothing
    for term in query_tokens_lst:
        for doc_no in doc_set:
            doc_set[doc_no] += weight_index.get(term, {}).get(doc_no, 0) * query_weight_index[term][query_doc_no]

    # Normalize by the document length and the query length
    for doc_no in doc_set:
        doc_set[doc_no] = doc_set[doc_no] / (doc_length[doc_no] * query_length[query_doc_no])


    results = get_results(doc_set, c_type)


    return results
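apply_search ranks candidate documents by tf-idf cosine similarity: the token index and the query are turned into weight vectors (index.calcWeight / index.calcQueryWeight), vector lengths are precomputed (index.calcDocLen), and each document sharing at least one term with the query is scored by the normalized inner product. Those index helpers are not shown on this page, so the sketch below assumes a common tf * log(N / df) weighting; the actual implementation may weight terms differently.

import math
from collections import defaultdict

def rank_by_cosine(tokens_lst, num_docs, query_tokens):
    # tokens_lst: {token: {doc_no: term frequency}}, as returned by the tokenize_* helpers.
    # Assumed weighting: w = tf * log(N / df); the real index.calcWeight may differ.
    doc_freq = {term: len(postings) for term, postings in tokens_lst.items()}
    weight_index = {
        term: {doc_no: tf * math.log(float(num_docs) / doc_freq[term])
               for doc_no, tf in postings.items()}
        for term, postings in tokens_lst.items()
    }

    # Document vector lengths (the role played by index.calcDocLen above)
    doc_length = defaultdict(float)
    for postings in weight_index.values():
        for doc_no, w in postings.items():
            doc_length[doc_no] += w * w
    doc_length = {doc_no: math.sqrt(s) for doc_no, s in doc_length.items()}

    # Query weights, treating the query as a single pseudo-document
    query_tf = defaultdict(int)
    for token in query_tokens:
        query_tf[token] += 1
    query_weight = {term: tf * math.log(float(num_docs) / doc_freq[term])
                    for term, tf in query_tf.items() if term in doc_freq}
    query_length = math.sqrt(sum(w * w for w in query_weight.values())) or 1.0

    # Inner product accumulated only over documents that share a term with the query,
    # then normalized by the two vector lengths -- the cosine similarity
    scores = defaultdict(float)
    for term, q_w in query_weight.items():
        for doc_no, d_w in weight_index[term].items():
            scores[doc_no] += d_w * q_w
    return {doc_no: s / (doc_length[doc_no] * query_length)
            for doc_no, s in scores.items() if doc_length[doc_no] > 0}

scores = rank_by_cosine({'beach': {1: 1, 2: 1}, 'sunset': {2: 2, 3: 1}}, num_docs=3, query_tokens=['sunset'])
# only documents 2 and 3 (the ones containing 'sunset') receive a score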