def test_phrase_search_force(self):
    """Check that a phrase search for "May the Force be with you." finds known sentences.

    The query is first run through ``preprocess`` and is expected to reduce to
    the two terms ``["may", "forc"]``. The phrase search is then timed (the
    elapsed time is printed for information only, it is not asserted on) and
    every sentence id listed below must be present in the results.
    """
    query = preprocess("May the Force be with you.")
    self.assertEqual(query, ["may", "forc"])
    query_params = {
        'query': query,
        'movie_title': '',
        'year': '',
        'actor': ''
    }
    # perf_counter() is monotonic and high-resolution, so the reported
    # duration cannot be skewed by wall-clock adjustments.
    start = time.perf_counter()
    results = query_phrase_search(query_params)
    elapsed = time.perf_counter() - start
    print("May the force be with you. {:.4f} s".format(elapsed))
    # Mostly Star Wars sentences; the last entry is a quote from another movie.
    expected_sentence_ids = [
        9603737,  # Star Wars: Episode VI - Return of the Jedi
        50966637,  # Star Wars: Episode VII - The Force Awakens
        14887784,  # Star Wars: Episode I - The Phantom Menace
        14886719,  # Star Wars: Episode I - The Phantom Menace
        14886561,  # Star Wars: Episode I - The Phantom Menace
        14904435,  # Star Wars: Episode III - Revenge of the Sith
        14904433,  # Star Wars: Episode III - Revenge of the Sith
        14903103,  # Star Wars: Episode II - Attack of the Clones
        14903102,  # Star Wars: Episode II - Attack of the Clones
        13503009,  # Hackers
    ]
    # Named sentence_id rather than id so the builtin id() is not shadowed.
    for sentence_id in expected_sentence_ids:
        self.assertIn(sentence_id, results,
                      f"Sentence _id {sentence_id} should be in the results.")
示例#2
0
def preprocess_query_params(query_params):
    """Normalise a search request in place and return it.

    Every expected key ('query', 'movie_title', 'actor', 'keywords', 'year',
    'categories') that is missing is filled with an empty string. The raw
    query string is then replaced by the output of ``preprocess`` and a
    boolean 'search_phrase' entry is added.

    'search_phrase' is True only when the raw query both starts and ends with
    a double quote AND the preprocessed query has at least two terms.

    :param query_params: mutable mapping of request parameters; modified in place.
    :return: the same ``query_params`` object.
    """
    # Fill in defaults; 'query' goes first so key insertion order is stable.
    for key in ('query', 'movie_title', 'actor', 'keywords', 'year',
                'categories'):
        query_params[key] = query_params.get(key, '')

    raw_query = query_params['query']
    # The quoting check has to look at the raw text, before preprocessing.
    is_quoted = raw_query.startswith('"') and raw_query.endswith('"')

    terms = preprocess(raw_query)
    query_params['query'] = terms
    # A phrase needs at least two terms to be meaningful.
    query_params['search_phrase'] = is_quoted and len(terms) >= 2

    return query_params
# Computes the average number of preprocessed terms per sentence by paging
# through the sentences collection LIMIT documents at a time.
#
# START_TOTAL_LENGTH / START_COUNT seed the running totals; presumably they
# are edited by hand to resume an interrupted run -- TODO confirm.
START_TOTAL_LENGTH = 0
START_COUNT = 0

mongo = MongoDB()
sentences = mongo.sentences
total_length = START_TOTAL_LENGTH
LIMIT = 100000  # page size for each find() call

total_counted = START_COUNT
while True:
    ss = sentences.find({}, {
        "sentence": 1,
        "_id": 0
    },
                        skip=total_counted,
                        limit=LIMIT)
    # NOTE(review): a pymongo find() returns a cursor, never None; the check
    # is kept only in case `sentences` is a wrapper with different behaviour.
    if ss is None:
        break

    batch_count = 0
    for s in ss:
        total_length += len(preprocess(s['sentence']))
        total_counted += 1
        batch_count += 1

    if batch_count:
        print("Count: {}\nTotal length: {}\n".format(total_counted,
                                                     total_length))
    # A short (or empty) page means the collection is exhausted. Testing the
    # page size rather than `total_counted % LIMIT` also terminates when the
    # collection size is an exact multiple of LIMIT, when it is empty, or
    # when START_COUNT is not a multiple of LIMIT.
    if batch_count < LIMIT:
        break

print("Finished.")

if total_counted:
    print("Average: {}".format(total_length / total_counted))
else:
    # Nothing was counted; avoid a ZeroDivisionError.
    print("Average: n/a (no sentences counted)")
 def test_preprocess(self):
     """``preprocess`` must reduce "I am your father." to the terms ["i", "father"]."""
     expected_terms = ["i", "father"]
     self.assertEqual(preprocess("I am your father."), expected_terms)
示例#5
0
# Walks the sentences collection and accumulates, per movie, the sum over its
# sentences of the number of distinct preprocessed terms in each sentence.
# Relies on names set up earlier in the file: sentences, total_counted,
# movie_id, movie_term_count, movie_term_counts and LIMIT.
# NOTE(review): the grouping logic assumes sentences of the same movie are
# returned consecutively -- confirm the collection's natural order.
try:
    while True:
        ss = sentences.find({}, {
            "sentence": 1,
            "movie_id": 1,
            "_id": 0
        },
                            skip=total_counted)
        # NOTE(review): a pymongo find() returns a cursor, never None; kept
        # in case `sentences` is a wrapper with different behaviour.
        if ss is None:
            break
        batch_count = 0
        for s in ss:
            if s['movie_id'] != movie_id:
                # Movie changed: store the finished movie's count and reset.
                movie_term_counts[movie_id] = movie_term_count
                movie_id = s['movie_id']
                movie_term_count = 0
            terms = set(preprocess(s['sentence']))
            movie_term_count += len(terms)
            total_counted += 1
            batch_count += 1
            if total_counted % 10000 == 0:
                print(
                    f"Count: {total_counted}, Total movies: {len(movie_term_counts)}"
                )
        # Store the count of the last movie seen in this pass.
        movie_term_counts[movie_id] = movie_term_count
        # find() has no limit, so one pass consumes everything from the skip
        # offset. An empty pass means nothing is left; without this check the
        # loop never ends when total_counted is an exact multiple of LIMIT.
        if batch_count == 0 or total_counted % LIMIT != 0:
            break
except KeyboardInterrupt:
    # Best effort: fall through and save whatever has been collected so far.
    print(f"Interrupted after {total_counted} sentences; saving partial results.")
except Exception as exc:
    # Still best effort, but report the failure instead of hiding it.
    print(f"Stopped after {total_counted} sentences due to error: {exc!r}")

# Context manager guarantees the pickle file is flushed and closed.
with open(f'movie_term_counts{int(total_counted/100000)}.p', 'wb') as out_file:
    pickle.dump(dict(movie_term_counts), out_file)
print("Finished.")