Пример #1
0
def create_table_inverted_index():
    """Rebuild the ``inverted_index`` table of ``data.db`` from ``webpages``.

    The table is dropped and recreated. Then, for every ``(URL, content)``
    row of ``webpages``, one ``(keyword, URL, frequency)`` row is inserted
    per distinct stemmed word of the page. ``frequency`` is the number of
    occurrences of the stem in the page divided by the number of distinct
    stems in that page.

    A confirmation message is printed once everything has been committed.
    """
    # Local import so the function does not depend on a file-level import.
    from collections import Counter

    conn = sqlite3.connect('data.db')
    try:
        conn.execute("DROP TABLE IF EXISTS inverted_index")
        conn.execute(
            "CREATE TABLE inverted_index (keyword TEXT, URL TEXT, frequency REAL)")

        insert_inverted_index = (
            "INSERT INTO inverted_index (keyword, URL, frequency) VALUES (?, ?, ?);")

        for url, content in conn.execute("SELECT URL, content FROM webpages"):
            # Stem every word exactly once and count in a single pass.
            # Re-stemming an already stemmed word (as was done before) can
            # produce a different string and therefore a wrong count.
            occurrences = Counter(stem(w) for w in extractListOfWords(content))

            # NOTE(review): the divisor is the number of *distinct* stems,
            # not the total number of words on the page -- confirm that this
            # normalisation is the intended one.
            nb_distinct = len(occurrences)

            # An empty page yields no rows, so the division never sees 0.
            conn.executemany(
                insert_inverted_index,
                ((keyword, url, count / nb_distinct)
                 for keyword, count in occurrences.items()))

        conn.commit()
    finally:
        # Release the database handle even if indexing fails midway.
        conn.close()

    print("Insertion in inverted_index is done!")
Пример #2
0
def tf_idf(word):
    """Print what ``data.db`` stores about the stem of *word*.

    First prints the row(s) returned by ``SELECT URL, MAX(frequency)`` on
    ``inverted_index`` for that stem, then the ``(keyword, idf)`` row(s) of
    ``inverse_document_frequency`` for the same stem.

    Args:
        word: the raw (unstemmed) word to look up.
    """
    keyword = stem(word)  # stem once, reuse for both lookups

    conn = sqlite3.connect('data.db')
    try:
        # Bound parameters instead of string formatting: a stem containing
        # a quote can neither break the statement nor inject SQL.
        for row in conn.execute(
                "SELECT URL, MAX(frequency) FROM inverted_index WHERE keyword = ?",
                (keyword,)):
            print(row)

        for row in conn.execute(
                "SELECT keyword, idf FROM inverse_document_frequency WHERE keyword = ?",
                (keyword,)):
            print(row)
    finally:
        conn.close()
Пример #3
0
def countFreq(L, corpus):
    L_stemmed = [stem(i) for i in L]
    cnt = Counter()
    for word in corpus:
        cnt[word] += 1

    tf = []
    for word in L_stemmed:
        if len(corpus) == 0:
            f = 0
        else:
            f = cnt[word] / len(corpus)
        tf.append((word, f))
    return tf
Пример #4
0
def tri_tfidf():
    """Prompt for a phrase and print the 10 best pages ranked by tf-idf.

    The phrase is split on whitespace and every word is stemmed. The score
    of a page is the sum, over the query stems, of
    ``tf(term, page) * idf(term)`` where ``tf`` comes from
    ``inverted_index.frequency`` and ``idf`` from
    ``inverse_document_frequency.idf``. Each ``(URL, tfidf)`` row is printed,
    best score first. An empty phrase prints no result.
    """
    query = input('Tapez une phrase: ')
    queryWords = [stem(w) for w in query.split()]
    print(queryWords)

    if not queryWords:
        # Nothing to search for; avoids emitting an empty ``IN ()`` list.
        return

    # One "?" per stem. Formatting a Python tuple into the SQL text produced
    # "('word',)" for single-word queries (a syntax error) and let the user
    # input reach the statement unescaped.
    placeholders = ", ".join("?" * len(queryWords))
    sql = (
        "SELECT URL, SUM(prod) AS tfidf "
        "FROM (SELECT ii.keyword, ii.URL, ii.frequency * idf.idf AS prod "
        "FROM inverted_index AS ii, inverse_document_frequency AS idf "
        "WHERE ii.keyword = idf.keyword AND ii.keyword IN ({})) "
        "GROUP BY URL "
        "ORDER BY SUM(prod) DESC LIMIT 10"
    ).format(placeholders)

    conn = sqlite3.connect('data.db')
    try:
        for row in conn.execute(sql, queryWords):
            print(row)
    finally:
        # Read-only query: nothing to commit, just release the handle.
        conn.close()
Пример #5
0
def tri_tfidf_pageRank():
    """Prompt for a phrase and print the 10 best pages by tf-idf x PageRank.

    The phrase is split on whitespace and every word is stemmed. For each
    page, the tf-idf score is the sum over the query stems of
    ``inverted_index.frequency * inverse_document_frequency.idf``; it is then
    multiplied by the page's ``page_rank.pageRank``. Each ``(URL, score)``
    row is printed, best score first. An empty phrase prints no result.
    """
    phrase = input('Tapez une phrase: ')
    queryWords = [stem(w) for w in phrase.split()]

    if not queryWords:
        # Nothing to search for; avoids emitting an empty ``IN ()`` list.
        return

    # One "?" per stem. Formatting a Python tuple into the SQL text produced
    # "('word',)" for single-word queries (a syntax error) and let the user
    # input reach the statement unescaped.
    placeholders = ", ".join("?" * len(queryWords))
    sql = (
        "SELECT pr.URL, t.tfidf * pr.pageRank AS score "
        "FROM page_rank AS pr, "
        "(SELECT URL, SUM(prod) AS tfidf "
        "FROM (SELECT ii.keyword, ii.URL, ii.frequency * idf.idf AS prod "
        "FROM inverted_index AS ii, inverse_document_frequency AS idf "
        "WHERE ii.keyword = idf.keyword AND ii.keyword IN ({})) AS tf_idf "
        "GROUP BY URL) AS t "
        "WHERE pr.URL = t.URL "
        "GROUP BY pr.URL "
        "ORDER BY score DESC LIMIT 10"
    ).format(placeholders)

    conn = sqlite3.connect('data.db')
    try:
        for row in conn.execute(sql, queryWords):
            print(row)
    finally:
        # Read-only query: nothing to commit, just release the handle.
        conn.close()
Пример #6
0
        if len(corpus) == 0:
            f = 0
        else:
            f = cnt[word] / len(corpus)
        tf.append((word, f))
    return tf


#pages = df_webpages['content'].values
# Build one row per web page: its URL followed by the frequency, in that
# page, of each word of L (as computed by countFreq).
columns = ['URL']
columns.extend(L)

rows = []
for _, row in df_webpages.iterrows():
    corpus = [stem(w) for w in extractListOfWords(row['content'])]
    rows.append([row['URL']] + [freq for _, freq in countFreq(L, corpus)])

# The frame is built once from the collected rows: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
result = pd.DataFrame(rows, columns=columns)

# idf(word) = log(number of pages / number of pages containing the word).
idf = {}
nb_pages = len(result)
for word in L:
    nb_containing = int((result[word] > 0).sum())
    # A word found in no page gets an infinite idf instead of dividing by 0.
    idf[word] = log(nb_pages / nb_containing) if nb_containing else float('inf')

idf = pd.DataFrame.from_dict(idf, orient='index')

conn.execute("DROP TABLE IF EXISTS inverted_index")
result.to_sql('inverted_index', conn, if_exists='replace', index=False)
Пример #7
0
#!/usr/bin/env python3

import sqlite3
from shared import stem, sortFreqDict
from collections import defaultdict
import sys

conn = sqlite3.connect('data.db')
cursor_1 = conn.cursor()
cursor_2 = conn.cursor()

# Take the query words from the command line when arguments were given,
# otherwise ask for them interactively.
if len(sys.argv) > 1:
    raw_words = sys.argv[1:]
else:
    query = input("Saisissez votre requête :")
    if query == "":
        # Nothing typed: fall back to a sample query.
        query = "comment multiplier des matrices"
    raw_words = query.split()

# Stem every word exactly once and drop the words whose stem is empty.
queryWords = [stemmed for stemmed in map(stem, raw_words) if stemmed != ""]
print()

# Compute the best answers to the query and output them.
# Two possible methods: (plain tf-idf) or (tf-idf * pagerank).

possible_modes = ["tf-idf", "tf-idf * pagerank"]
for mode in possible_modes:
    # création d'un dictionnaire contenant la somme des points par page