示例#1
0
def get_index(document_list):
    """Build the direct index for a collection of documents.

    Each document is a dict holding a 'text' entry. Documents are updated
    in place: 'terms' receives the result of ``parser.extract_terms`` on the
    text, the 'text' entry is removed, and 'id' is set to the document's
    position in the returned index.

    Args:
        document_list: iterable of document dicts, each with a 'text' entry.

    Returns:
        list: the same document dicts, in input order.
    """
    index = []
    for document in document_list:
        # Take terms from the text
        document['terms'] = parser.extract_terms(document['text'])
        # The raw text is not needed once its terms are known
        del document['text']
        # The document ID is its position within the direct index
        document['id'] = len(index)
        index.append(document)
    return index
示例#2
0
def get_index(reviews_list):
    """Build the direct index of reviews and the review-id -> ASIN mapping.

    Each review is a dict with 'text' and 'asin' entries. Accepted reviews
    are updated in place: 'terms' receives the result of
    ``parser.extract_terms`` on the text, the 'text' entry is removed, and
    'id' is set to the review's position in the returned index.

    Args:
        reviews_list: iterable of review dicts.

    Returns:
        tuple: ``(index, items)`` where ``index`` is the list of processed
        reviews and ``items`` maps each review id to its ASIN.
        None is returned (after printing a message) as soon as a review
        with a missing or empty ASIN is encountered.
    """
    index = []
    items = {}
    for review in reviews_list:
        # Check the ASIN before modifying the review, so a rejected review
        # keeps its text; .get() also covers a review with no 'asin' key.
        asin = review.get('asin')
        if not asin:
            # Single-argument form prints the same on Python 2 and 3.
            print("Oops!  I miss my ASIN")
            return None
        # Take terms from the text
        review['terms'] = parser.extract_terms(review['text'])
        # Now text is not needed anymore
        del review['text']
        # The document ID is its position within the direct index
        review['id'] = len(index)
        index.append(review)
        items[review['id']] = asin
    return index, items
示例#3
0
def get_index(reviews_list):
    """Build the direct index of reviews and the review-id -> ASIN mapping.

    Each review is a dict with 'text' and 'asin' entries. Accepted reviews
    are updated in place: 'terms' receives the result of
    ``parser.extract_terms`` on the text, the 'text' entry is removed, and
    'id' is set to the review's position in the returned index.

    Args:
        reviews_list: iterable of review dicts.

    Returns:
        tuple: ``(index, items)`` where ``index`` is the list of processed
        reviews and ``items`` maps each review id to its ASIN.
        None is returned (after printing a message) as soon as a review
        with a missing or empty ASIN is encountered.
    """
    index = []
    items = {}
    for review in reviews_list:
        # Check the ASIN before modifying the review, so a rejected review
        # keeps its text; .get() also covers a review with no 'asin' key.
        asin = review.get('asin')
        if not asin:
            # Single-argument form prints the same on Python 2 and 3.
            print("Oops!  I miss my ASIN")
            return None
        # Take terms from the text
        review['terms'] = parser.extract_terms(review['text'])
        # Now text is not needed anymore
        del review['text']
        # The document ID is its position within the direct index
        review['id'] = len(index)
        index.append(review)
        items[review['id']] = asin
    return index, items
示例#4
0
# Build the TFIDF representation of every document in the corpus.
# (Single-argument print(...) emits the same text on Python 2 and 3.)
print('Computing TFIDF representations of documents in the corpus')
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)

# Disabled debugging aid: print all cosine similarities between documents
# similarities = [[TFIDF.cosine_similarity(d1,d2) for d2 in TFIDFs] for d1 in TFIDFs]
# print similarities
# exit()

# Given a query, compute its TFIDF representation
print("Computing query's TFIDF representation")

query = 'business became meaningful'
query_terms = parser.extract_terms(query)
q_TFIDF = TFIDF.compute_new_TFIDF(
    query_terms, inverted_index, idf_threshold)

# Warn if the query is empty due to a high IDF threshold
if not len(q_TFIDF):
    print('*** WARNING *** Empty query, IDf threshold too high')

# Disabled debugging aid: dump the query representation and stop
# print q_TFIDF
# exit()

######################################################################

# Build the set representation of every document in the corpus
print('Computing set representations of documents in the corpus')
sets = jaccard.compute_all_sets(inverted_index, idf_threshold)
示例#5
0
    print "IDF threshold set at", idf_threshold, ".\n"

###############################################################################

# Build the TFIDF representation of everything in the inverted index.
# (Single-argument print(...) emits the same text on Python 2 and 3.)
print('Computing TFIDF representations...')
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)
#print "TFIDF  " + str(TFIDFs)

###############################################################################

# Stays None when a search text is used; otherwise set to the key of the
# target item in `items`.
target_index = None

if search_text:
    # Given a search_text, compute its TFIDF representation
    print("Computing search text's TFIDF representation...")
    search_terms = parser.extract_terms(search_text)
    searched_TFIDF = TFIDF.compute_new_TFIDF(
        search_terms, inverted_index, idf_threshold)

    # Warn if the search_text is empty due to a high IDF threshold
    if len(searched_TFIDF) == 0:
        print('*** WARNING *** Empty search, IDF threshold is too high!')
else:
    # Otherwise, ask for a target item:
    asin = raw_input("Please enter an item ASIN (e.g. 1603112251): ")

    print('Find TFIDF representation...')
    # First key of `items` whose value equals the entered ASIN, or None.
    # The loop variable is not called `id`, to avoid shadowing the builtin.
    target_index = next(
        (item_id for item_id, code in items.items() if code == asin), None)

    # Error if the targeted ASIN is not found. Identity test: a valid key
    # such as 0 is falsy but must not be mistaken for "not found".
    if target_index is None:
        sys.exit('*** FATAL *** Item not found!')