def test_phrase_search_force(self):
    # Phrase search for "May the Force be with you." must surface the known
    # sentence ids listed below (mostly Star Wars lines, plus one from Hackers).
    expected_sentence_ids = [
        9603737,   # Star Wars: Episode VI - Return of the Jedi
        50966637,  # Star Wars: Episode VII - The Force Awakens
        14887784,  # Star Wars: Episode I - The Phantom Menace
        14886719,  # Star Wars: Episode I - The Phantom Menace
        14886561,  # Star Wars: Episode I - The Phantom Menace
        14904435,  # Star Wars: Episode III - Revenge of the Sith
        14904433,  # Star Wars: Episode III - Revenge of the Sith
        14903103,  # Star Wars: Episode II - Attack of the Clones
        14903102,  # Star Wars: Episode II - Attack of the Clones
        13503009,  # Hackers
    ]

    # Preprocessing is expected to reduce the quote to these two terms.
    terms = preprocess("May the Force be with you.")
    self.assertEqual(terms, ["may", "forc"])

    # All filters are left empty so only the phrase itself drives the search.
    query_params = dict(query=terms, movie_title='', year='', actor='')

    # Time the search call and print the duration for manual inspection.
    started_at = time.time()
    results = query_phrase_search(query_params)
    elapsed = time.time() - started_at
    print("May the force be with you. {:.4f} s".format(elapsed))

    for sentence_id in expected_sentence_ids:
        self.assertIn(
            sentence_id,
            results,
            f"Sentence _id {sentence_id} should be in the results.",
        )
def preprocess_query_params(query_params):
    """Normalise raw search parameters in place and return them.

    Every expected parameter that is missing is set to an empty string, the
    free-text query is replaced by the output of ``preprocess``, and a
    ``search_phrase`` flag is added.

    Args:
        query_params: Mapping of request parameters; it is mutated.

    Returns:
        The same ``query_params`` object, with ``'query'`` holding the
        preprocessed terms and ``'search_phrase'`` holding a bool.
    """
    # Missing params default to empty strings ('query' first, as before).
    for name in ('query', 'movie_title', 'actor', 'keywords', 'year',
                 'categories'):
        query_params[name] = query_params.get(name, '')

    raw_query = query_params['query']
    # Phrase search is requested by wrapping the whole query in double quotes.
    is_quoted = raw_query.startswith('"') and raw_query.endswith('"')

    terms = preprocess(raw_query)

    query_params['query'] = terms
    # A phrase must consist of at least two terms after preprocessing.
    query_params['search_phrase'] = is_quoted and len(terms) >= 2
    return query_params
# Compute the average number of preprocessed terms per sentence by paging
# through the sentences collection LIMIT documents at a time.

# Starting values for the running totals.
# NOTE(review): presumably set to non-zero values to resume an interrupted
# run -- confirm.
START_TOTAL_LENGTH = 0
START_COUNT = 0
LIMIT = 100000  # number of sentences requested per batch

mongo = MongoDB()
sentences = mongo.sentences

total_length = START_TOTAL_LENGTH
total_counted = START_COUNT

while True:
    ss = sentences.find({}, {
        "sentence": 1,
        "_id": 0
    }, skip=total_counted, limit=LIMIT)

    batch_count = 0
    for s in ss:
        total_length += len(preprocess(s['sentence']))
        total_counted += 1
        batch_count += 1
    print("Count: {}\nTotal length: {}\n".format(total_counted, total_length))

    # A short (or empty) batch means the collection is exhausted. Checking the
    # batch size instead of `total_counted % LIMIT` also terminates when the
    # collection is empty or its size is an exact multiple of LIMIT, and does
    # not stop early when START_COUNT is not a multiple of LIMIT.
    if batch_count < LIMIT:
        break

print("Finished.")
if total_counted:
    print("Average: {}".format(1.0 * total_length / total_counted))
else:
    # Avoid a ZeroDivisionError when no sentence was counted at all.
    print("Average: n/a (no sentences counted)")
def test_preprocess(self):
    # "I am your father." is expected to be reduced to exactly these two
    # lower-cased terms; the remaining words and the punctuation are dropped.
    expected_terms = ["i", "father"]
    actual_terms = preprocess("I am your father.")
    self.assertEqual(actual_terms, expected_terms)
# Tally, per movie, the sum over its sentences of the number of distinct
# preprocessed terms in each sentence, then pickle the result.
# NOTE(review): a movie's tally is written whenever movie_id changes, so this
# assumes sentences arrive grouped by movie_id -- confirm against the
# collection's ordering.
try:
    while True:
        ss = sentences.find({}, {
            "sentence": 1,
            "movie_id": 1,
            "_id": 0
        }, skip=total_counted)

        batch_count = 0
        for s in ss:
            if s['movie_id'] != movie_id:
                # Movie changed: store the finished tally and start a new one.
                movie_term_counts[movie_id] = movie_term_count
                movie_id = s['movie_id']
                movie_term_count = 0
            terms = set(preprocess(s['sentence']))
            movie_term_count += len(terms)
            total_counted += 1
            batch_count += 1
            if total_counted % 10000 == 0:
                print(
                    f"Count: {total_counted}, Total movies: {len(movie_term_counts)}"
                )
        # Store the tally of the last movie seen in this pass.
        movie_term_counts[movie_id] = movie_term_count

        # The empty-pass check guarantees termination when the number of
        # sentences is an exact multiple of LIMIT (the modulo test alone would
        # keep re-querying an exhausted collection forever).
        if batch_count == 0 or total_counted % LIMIT != 0:
            break
except (Exception, KeyboardInterrupt) as exc:
    # Best effort: still dump whatever was collected, but say why we stopped
    # instead of silently swallowing the error.
    print(f"Stopped early after {total_counted} sentences: {exc!r}")

# The file name carries the number of completed 100000-sentence blocks.
with open(f'movie_term_counts{total_counted // 100000}.p', 'wb') as out_file:
    pickle.dump(dict(movie_term_counts), out_file)
print("Finished.")