예제 #1
0
def fastSearch(words):
    """Return up to ten of the best-scoring files for the given words.

    Each distinct word is looked up in the ``wordfile`` table by
    ``fun.hashF(word)``. A file's final score is the sum of the scores of
    its matching rows multiplied by the number of matching rows.

    Args:
        words: Iterable of search words; duplicates are ignored.

    Returns:
        A list of ``[path, title]`` pairs ordered from highest to lowest
        score, at most ten entries long. ``path`` is ``"simple/articles"``
        followed by the stored ``fileAdd`` value; ``title`` is
        ``fileAdd[7:-5]`` with ``_`` and ``~`` replaced by spaces.
    """
    # fileID -> [sum of matched scores, number of matched rows]
    totals = {}
    for word in set(words):
        Google.execute(
            """SELECT fileID, score FROM wordfile WHERE word = "%s";""" %
            (fun.hashF(word)))
        for fileID, score in Google.fetchall():
            if fileID in totals:
                totals[fileID][0] += score
                totals[fileID][1] += 1
            else:
                totals[fileID] = [score, 1]

    ranking = {
        fileID: scoreSum * hits
        for fileID, (scoreSum, hits) in totals.items()
    }

    # Rank by score before truncating; taking the first ten keys in
    # insertion order would ignore the score that was just computed.
    bestFiles = sorted(ranking, key=ranking.get, reverse=True)[:10]

    results = []
    for fileID in bestFiles:
        Google.execute("""SELECT fileAdd
			FROM files
			WHERE fileID = "%s";""" % (fileID))

        rows = Google.fetchall()
        # NOTE(review): assumes every fileID in wordfile has a row in files;
        # rows[0] fails otherwise - confirm against the indexer.
        fileAdd = rows[0][0]
        results.append([
            "simple/articles" + fileAdd,
            fileAdd[7:-5].replace("_", " ").replace("~", " ")
        ])

    return results
예제 #2
0
def countWords(file, fileMap):
    """Compute weighted word scores for one HTML file and store them.

    The file at ``fun.rootFolder + file`` is parsed with BeautifulSoup.
    Every word in the document text scores 1, and words inside title and
    heading tags receive an additional weight per occurrence. The resulting
    ``{word: score}`` mapping is stored as ``fileMap[fun.hashF(file)]``.

    Args:
        file: Path of the HTML file, relative to ``fun.rootFolder``.
        fileMap: Dict that is updated in place with this file's word map.

    Returns:
        None. The result is written into ``fileMap``.
    """
    # Parse inside a ``with`` block so the file handle is closed as soon as
    # BeautifulSoup has read it, instead of being left open.
    with open(fun.rootFolder + file, encoding="latin-1") as handle:
        soup = BeautifulSoup(handle, 'html.parser')
    filehash = fun.hashF(file)

    wordMap = {}

    ###### The weightage of different words is
    ## title - 100
    ## h1 - 100
    ## h2 - 10
    ## h3 - 30
    ## h4 - 20
    ## h5 - 0
    ## h6 - 0
    ## normal text - 1
    ################

    # NOTE(review): incrementValue is defined elsewhere; presumably it adds
    # the given weight to wordMap[word] - confirm.
    tagWeights = [['title', 100], ['h1', 100], ['h2', 10], ['h3', 30],
                  ['h4', 20]]
    for tagName, weight in tagWeights:
        for element in soup.find_all(tagName):
            text = element.get_text().lower().translate(fun.translator)
            for word in text.split():
                incrementValue(wordMap, word, weight)

    # Full-document pass: every word scores 1.
    for word in soup.get_text().lower().translate(fun.translator).split():
        incrementValue(wordMap, word, 1)

    fileMap[filehash] = wordMap
예제 #3
0
def searchForSyn(words):
    """Search for files matching the words, boosted by their synonyms.

    Files matching the words themselves score ``SUM(score) * COUNT(score)``
    over their ``wordfile`` rows. Files matching any collected synonym get
    an extra ``SUM(score) / 100`` added (or start from that value if they
    matched no original word).

    Args:
        words: Re-iterable collection of search words.

    Returns:
        A list of ``[path, title]`` pairs ordered from highest to lowest
        score. ``path`` is ``"simple/articles"`` followed by the stored
        ``fileAdd`` value; ``title`` is ``fileAdd[7:-5]`` with ``_``
        replaced by spaces. Empty when there are no words and no synonyms.
    """
    # NOTE(review): getSynonyms is defined elsewhere; it is expected to
    # append the synonyms of ``word`` to the list passed in - confirm.
    synonyms = []
    for word in words:
        getSynonyms(word, synonyms)

    fileMap = {}

    # An empty IN (...) list is invalid SQL, so each query only runs when
    # there is at least one value to put in it.
    wordList = ", ".join('"%s"' % (fun.hashF(w)) for w in words)
    if wordList:
        Google.execute(
            """SELECT fileID, SUM(score) * COUNT(score) FROM wordfile WHERE word in ("""
            + wordList + """) GROUP BY fileID""")
        for file, score in Google.fetchall():
            fileMap[file] = score

    synonymList = ", ".join('"%s"' % (fun.hashF(w)) for w in set(synonyms))
    if synonymList:
        Google.execute(
            """SELECT fileID, SUM(score) FROM wordfile WHERE word in ("""
            + synonymList + """) GROUP BY fileID""")
        for file, score in Google.fetchall():
            # Synonym hits count for a hundredth of a direct hit.
            if file in fileMap:
                fileMap[file] += score / 100
            else:
                fileMap[file] = score / 100

    results = []
    for file, score in sorted(fileMap.items(),
                              key=operator.itemgetter(1),
                              reverse=True):
        Google.execute("""SELECT fileAdd
			FROM files
			WHERE fileID = "%s";""" % (file))

        rows = Google.fetchall()
        fileAdd = rows[0][0]
        results.append([
            "simple/articles" + fileAdd, fileAdd[7:-5].replace("_", " ")
        ])

    return results
예제 #4
0
def getInsertables(files):
    """Build ``(hash, path, category)`` rows for the files worth indexing.

    Each entry is converted to a string and its first 64 characters are
    dropped. Entries whose remaining name contains ``talk~``, ``user~``,
    ``template~`` or ``wikipedia~`` (case-insensitive) are skipped.

    Args:
        files: Iterable of file names or path-like objects.

    Returns:
        A list of ``(fun.hashF(name), name_with_slashes, category)`` tuples,
        where ``category`` is 1 for articles, 2 for categories and 3 for
        images, and at most the first four backslashes of the name are
        replaced by forward slashes.
    """
    ignoredMarkers = ('talk~', 'user~', 'template~', 'wikipedia~')
    rows = []

    for entry in files:
        name = str(entry)[64:]
        lowered = name.lower()

        # Talk pages, user pages, templates and wikipedia pages are ignored.
        if not any(marker in lowered for marker in ignoredMarkers):
            # Remaining files: 2 = category, 3 = image, 1 = article.
            if 'Category~' in name:
                category = 2
            elif 'Image~' in name:
                category = 3
            else:
                category = 1

            rows.append(
                (fun.hashF(name), name.replace("\\", "/", 4), category))

    return rows
예제 #5
0
def search(words):
    """Return every file matching any of the words, best score first.

    Files are ranked in SQL by ``SUM(score) * COUNT(score)`` over their
    matching ``wordfile`` rows, with words looked up by ``fun.hashF(word)``.

    Args:
        words: Iterable of search words; duplicates are ignored.

    Returns:
        A list of ``[path, title]`` pairs in ranking order. ``path`` is
        ``"simple/articles"`` followed by the stored ``fileAdd`` value;
        ``title`` is ``fileAdd[7:-5]`` with ``_`` replaced by spaces.
        Empty when ``words`` is empty.
    """
    wordList = ", ".join('"%s"' % (fun.hashF(w)) for w in set(words))
    if not wordList:
        # An empty IN (...) list is invalid SQL; nothing can match anyway.
        return []

    Google.execute(
        """SELECT fileID FROM wordfile WHERE word in (""" + wordList +
        """) GROUP BY fileID ORDER BY SUM(score) * COUNT(score) desc;""")

    matches = Google.fetchall()

    results = []
    for match in matches:
        Google.execute("""SELECT fileAdd
			FROM files
			WHERE fileID = "%s";""" % (match[0]))

        # Separate name so the list being iterated is not rebound mid-loop.
        rows = Google.fetchall()
        fileAdd = rows[0][0]
        results.append([
            "simple/articles" + fileAdd, fileAdd[7:-5].replace("_", " ")
        ])
    return results
예제 #6
0
def returnSorted(fileMap):
    """Flatten a per-file word map into a sorted list of tuples.

    Args:
        fileMap: Mapping of file key to a ``{word: score}`` mapping.

    Returns:
        A list of ``(fun.hashF(word), file, score)`` tuples in ascending
        tuple order.
    """
    triples = [
        (fun.hashF(word), file, score)
        for file, wordScores in fileMap.items()
        for word, score in wordScores.items()
    ]
    triples.sort()
    return triples