def get_index(document_list):
    """Build the direct index for a list of documents.

    Every document (a dict with a 'text' entry) is updated in place:
      * 'terms' receives the result of parser.extract_terms() on its text,
      * 'text' is removed, since it is not needed once terms are extracted,
      * 'id' is set to the document's position within the returned index.

    Returns a list holding the same (mutated) document dicts, in input
    order. An empty input yields an empty index.
    """
    index = []
    for position, document in enumerate(document_list):
        # Take terms from the text
        document['terms'] = parser.extract_terms(document['text'])
        # Now document text is not needed anymore
        del document['text']
        # The document ID is its position within the direct index
        document['id'] = position
        index.append(document)
    return index
def get_index(reviews_list):
    """Build the direct index of reviews and the review-id -> ASIN map.

    Every review (a dict with 'text' and 'asin' entries) is updated in
    place:
      * 'terms' receives the result of parser.extract_terms() on its text,
      * 'text' is removed, since it is not needed once terms are extracted,
      * 'id' is set to the review's position within the returned index.

    Returns a tuple (index, items) where index is the list of the same
    (mutated) review dicts and items maps each review id to its ASIN.

    If a review has a missing or empty 'asin', a message is printed and
    None is returned instead of the tuple. The offending review is left
    untouched; reviews processed before it have already been updated.
    """
    index = []
    items = {}
    for review in reviews_list:
        # Check the ASIN first, before the review is modified. A missing
        # key is treated like an empty value rather than raising KeyError.
        if not review.get('asin'):
            print("Oops! I miss my ASIN")
            return None
        # Take terms from the text
        review['terms'] = parser.extract_terms(review['text'])
        # Now text is not needed anymore
        del review['text']
        # The document ID is its position within the direct index
        review['id'] = len(index)
        index.append(review)
        items[review['id']] = review['asin']
    return index, items
def get_index(reviews_list):
    """Build the direct index of reviews and the review-id -> ASIN map.

    Every review (a dict with 'text' and 'asin' entries) is updated in
    place:
      * 'terms' receives the result of parser.extract_terms() on its text,
      * 'text' is removed, since it is not needed once terms are extracted,
      * 'id' is set to the review's position within the returned index.

    Returns a tuple (index, items) where index is the list of the same
    (mutated) review dicts and items maps each review id to its ASIN.

    If a review has a missing or empty 'asin', a message is printed and
    None is returned instead of the tuple. The offending review is left
    untouched; reviews processed before it have already been updated.
    """
    index = []
    items = {}
    for review in reviews_list:
        # Check the ASIN first, before the review is modified. A missing
        # key is treated like an empty value rather than raising KeyError.
        if not review.get('asin'):
            print("Oops! I miss my ASIN")
            return None
        # Take terms from the text
        review['terms'] = parser.extract_terms(review['text'])
        # Now text is not needed anymore
        del review['text']
        # The document ID is its position within the direct index
        review['id'] = len(index)
        index.append(review)
        items[review['id']] = review['asin']
    return index, items
# TFIDF representations of the documents in the corpus.
print('Computing TFIDF representations of documents in the corpus')
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)

# Disabled debug aid: print all cosine similarities between documents.
# similarities = [[TFIDF.cosine_similarity(d1,d2) for d2 in TFIDFs] for d1 in TFIDFs]
# print similarities
# exit()

# Given a query, compute its TFIDF representation.
print("Computing query's TFIDF representation")
query = 'business became meaningful'
query_terms = parser.extract_terms(query)
q_TFIDF = TFIDF.compute_new_TFIDF(query_terms, inverted_index, idf_threshold)

# Warn if the query is empty due to a high IDF threshold.
if len(q_TFIDF) == 0:
    print('*** WARNING *** Empty query, IDf threshold too high')

# Disabled debug aid: dump the query representation and stop.
# print q_TFIDF
# exit()

######################################################################
# Set representations of the documents in the corpus.
print('Computing set representations of documents in the corpus')
sets = jaccard.compute_all_sets(inverted_index, idf_threshold)
# Report the IDF threshold in use. The pieces are joined with single
# spaces, which is exactly what the comma-separated print statement emits.
print(" ".join(["IDF threshold set at", str(idf_threshold), ".\n"]))

###############################################################################
# TFIDF representations of the documents, filtered by the IDF threshold.
print('Computing TFIDF representations...')
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)
#print "TFIDF " + str(TFIDFs)

###############################################################################
# Choose what to compare against: either a free-text search (target_index
# stays None) or an existing item picked by ASIN (target_index is its id).
target_index = None
if search_text:
    # Given a search_text, compute its TFIDF representation
    print("Computing search text's TFIDF representation...")
    search_terms = parser.extract_terms(search_text)
    searched_TFIDF = TFIDF.compute_new_TFIDF(search_terms, inverted_index,
                                             idf_threshold)
    # Warn if the search_text is empty due to a high IDF threshold
    if len(searched_TFIDF) == 0:
        print('*** WARNING *** Empty search, IDF threshold is too high!')
else:
    # Otherwise, ask for a target item:
    asin = raw_input("Please enter an item ASIN (e.g. 1603112251): ")
    print('Find TFIDF representation...')
    # First id whose ASIN matches the one entered, or None if there is none.
    target_index = next(
        (doc_id for doc_id, code in items.items() if code == asin), None)
    # Error if the targeted ASIN is not found. Identity test, because a
    # valid id of 0 must not be confused with "not found".
    if target_index is None:
        sys.exit('*** FATAL *** Item not found!')