import os
import sys
from collections import Counter

import helpers
import textprocessing

print('starting...')

resources_path = os.path.join(os.getcwd(), 'resources')
data_path = os.path.join(os.getcwd(), 'data')

if not os.path.isdir(resources_path):
    print('ERROR: {} does not exist'.format(resources_path))
    sys.exit(1)

if not os.path.exists(data_path):
    os.mkdir(data_path)

dataset_path = os.path.join(resources_path, 'dataset')
stopwords_file = os.path.join(resources_path, 'stopwords_en.txt')

stopwords = helpers.get_stopwords(stopwords_file)

docs = helpers.get_docs(dataset_path)
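
# get_stopwords and get_docs live in a helpers module that is not shown; a
# plausible minimal version of each (an assumption, not the original code):
#
#   def get_stopwords(path):
#       with open(path) as f:
#           return set(f.read().split())
#
#   def get_docs(path):
#       return [os.path.join(path, name) for name in sorted(os.listdir(path))]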

# Build a bag-of-words representation for every document in the dataset
corpus = []
for doc in docs:
    with open(doc, mode='r') as f:
        text = f.read()
        words = textprocessing.preprocess_text(text, stopwords)
        bag_of_words = Counter(words)
        corpus.append(bag_of_words)

# Turn raw term counts into normalized tf-idf weights
idf = helpers.compute_idf(corpus)
for doc in corpus:
    helpers.compute_weights(idf, doc)
    helpers.normalize(doc)
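
# The helpers module is not shown here. A minimal sketch of the three calls
# above (an assumption, not the original implementation):
#
#   def compute_idf(corpus):
#       # idf = log(N / df) from each term's document frequency
#       df = Counter(word for doc in corpus for word in doc)
#       return {word: math.log(len(corpus) / count) for word, count in df.items()}
#
#   def compute_weights(idf, doc):
#       # In-place tf-idf: idf * (1 + log(tf)), the same scheme the query
#       # weighting in Example 2 below uses
#       for word, tf in doc.items():
#           doc[word] = idf[word] * (1 + math.log(tf))
#
#   def normalize(doc):
#       # Scale the weight vector to unit length for cosine scoring
#       norm = math.sqrt(sum(w * w for w in doc.values()))
#       for word in doc:
#           doc[word] /= norm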

# Example 2

import math
import os
import pickle
from collections import Counter
from datetime import datetime

import numpy as np
from flask import request, render_template
from PIL import Image
from werkzeug.utils import secure_filename

import helpers
import textprocessing
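
# `fe` (a feature extractor), `features` (the matrix of precomputed image
# features), and `img_paths` (paths of the indexed images) are assumed to be
# defined at module level elsewhere in the app, and index() is presumably
# registered on a Flask app, e.g. with @app.route('/', methods=['GET', 'POST']).
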
def index():
    if request.method == 'POST':
        file = request.files['query_data']
        filename = secure_filename(file.filename)
        extension = os.path.splitext(filename)[1].lower()
        if extension == '.jpg':
            # Image query: extract features from the upload and rank the
            # indexed images by distance to the query features
            img = Image.open(file.stream)  # PIL image
            uploaded_img_path = "static/uploaded/" + datetime.now().isoformat() + "_" + filename
            img.save(uploaded_img_path)

            query = fe.extract(img)
            dists = np.linalg.norm(features - query, axis=1)  # L2 distance to each indexed feature vector
            ids = np.argsort(dists)[:30]  # Top 30 results
            scores = [(dists[i], img_paths[i]) for i in ids]
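            # Smaller distance means a more similar image; each entry of
            # `scores` pairs a distance with the path of the matching image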

            return render_template('index.html',
                                   query_path=uploaded_img_path,
                                   scores=scores)
        else:
            # Text query: score documents with tf-idf over the inverted index
            query = file.read().decode("utf-8")
            docs_file = os.path.join(os.getcwd(), 'data', 'docs.pickle')
            inverted_index_file = os.path.join(
                os.getcwd(), 'data', 'inverted_index.pickle')

            stopwords_file = os.path.join(os.getcwd(), 'resources', 'stopwords_en.txt')

            # Deserialize data
            with open(docs_file, 'rb') as f:
                docs = pickle.load(f)
            with open(inverted_index_file, 'rb') as f:
                inverted_index = pickle.load(f)

            stopwords = helpers.get_stopwords(stopwords_file)

            dictionary = set(inverted_index.keys())

            # Preprocess the query and drop terms missing from the index

            query = textprocessing.preprocess_text(query, stopwords)
            query = [word for word in query if word in dictionary]
            query = Counter(query)

            # Weight query terms with the same tf-idf scheme: idf * (1 + log(tf))
            for word, value in query.items():
                query[word] = inverted_index[word]['idf'] * (1 + math.log(value))
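            # Worked example: a term occurring twice (tf = 2) with idf = 1.5
            # gets weight 1.5 * (1 + ln 2) ≈ 2.54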

            helpers.normalize(query)

            # Accumulate dot products between the query vector and every
            # document vector via the postings lists
            scores = [[i, 0] for i in range(len(docs))]
            for word, value in query.items():
                for doc_id, weight in inverted_index[word]['postings_list']:
                    scores[doc_id][1] += value * weight

            scores.sort(key=lambda doc: doc[1], reverse=True)

            # Collect documents with non-zero scores (list is already sorted)
            all_docs = []
            all_scores = []
            for doc_id, score in scores:
                if score == 0:
                    break
                all_docs.append(docs[doc_id])
                all_scores.append(score)
            return render_template('docindex.html',
                                   query_path=filename,
                                   docs=zip(all_docs, all_scores))

    else:
        return render_template('index.html')