def fastSearch(words):
    """Return up to 10 search results for *words*, best matches first.

    For each distinct query word, look up (fileID, score) rows in the
    wordfile table and accumulate per-file totals.  A file's final
    relevance is totalScore * matchedWordCount, which rewards files that
    match several query words.  Each result is a 2-item list:
    [relative file path, human-readable title].

    BUG FIX: the original computed the combined score per file but then
    took ``list(fileMap.keys())[:10]`` — the first 10 files in dict
    insertion order — never sorting by the score it had just computed.
    We now take the 10 highest-scoring files.
    """
    fileMap = {}
    for word in set(words):
        Google.execute(
            """SELECT fileID, score FROM wordfile WHERE word = "%s";"""
            % (fun.hashF(word)))
        for fileID, score in Google.fetchall():
            if fileID in fileMap:
                fileMap[fileID][0] += score   # running score total
                fileMap[fileID][1] += 1       # number of query words matched
            else:
                fileMap[fileID] = [score, 1]
    # Collapse [total, hits] into a single combined relevance number.
    for fileID, (total, hits) in fileMap.items():
        fileMap[fileID] = total * hits
    # Top 10 files by combined relevance, highest first.
    topFiles = sorted(fileMap, key=fileMap.get, reverse=True)[:10]
    results = []
    for fileID in topFiles:
        Google.execute("""SELECT fileAdd FROM files WHERE fileID = "%s";""" % (fileID))
        data = Google.fetchall()
        # data[0][0] is the stored path; [7:-5] strips a fixed prefix and the
        # ".html" suffix to recover a display title.
        results.append([
            "simple/articles" + data[0][0],
            data[0][0][7:-5].replace("_", " ").replace("~", " ")
        ])
    return results
def countWords(file, fileMap):
    """Parse one HTML file and store its weighted word counts in *fileMap*.

    Words are lowercased, stripped of punctuation via ``fun.translator``,
    and credited with a weight that depends on the enclosing tag:

        title: 100   h1: 100   h2: 10   h3: 30   h4: 20   h5/h6: 0
        body text: 1 (note: headline words are counted again here)

    The resulting {word: weight} map is stored under the file's hash:
    ``fileMap[fun.hashF(file)] = wordMap``.

    FIX: the original passed an open file object to BeautifulSoup and never
    closed it; a ``with`` block now guarantees the handle is released.
    """
    with open(fun.rootFolder + file, encoding="latin-1") as fh:
        soup = BeautifulSoup(fh, 'html.parser')
    filehash = fun.hashF(file)
    wordMap = {}
    # Headline tags with a non-zero weight (h5/h6 are weight 0, so skipped).
    for tagName, weight in [['title', 100], ['h1', 100], ['h2', 10],
                            ['h3', 30], ['h4', 20]]:
        for element in soup.find_all(tagName):
            for token in element.get_text().lower().translate(fun.translator).split():
                incrementValue(wordMap, token, weight)
    # Every word in the full page text (headlines included) adds 1 more.
    for token in soup.get_text().lower().translate(fun.translator).split():
        incrementValue(wordMap, token, 1)
    fileMap[filehash] = wordMap
def searchForSyn(words):
    """Search with synonym expansion; return [path, title] pairs, best first.

    Direct query-word matches score SUM(score) * COUNT(score) per file;
    synonym matches contribute only 1% of their summed score.  Results
    are sorted by combined score, highest first.

    FIX: the original built its IN-lists with string slicing (``s[:-2]``),
    which produced malformed SQL (``... IN )``) whenever *words* or the
    synonym set was empty.  IN-lists are now built with ``join`` and the
    empty cases are guarded.
    """
    if not words:
        return []
    synonyms = []
    for word in words:
        getSynonyms(word, synonyms)
    # --- direct matches ------------------------------------------------
    wordList = ", ".join('"%s"' % fun.hashF(w) for w in words)
    Google.execute(
        "SELECT fileID, SUM(score) * COUNT(score) FROM wordfile "
        "WHERE word in (%s) GROUP BY fileID" % wordList)
    fileMap = {}
    for fileID, score in Google.fetchall():
        fileMap[fileID] = score
    # --- synonym matches (weighted down to 1%) -------------------------
    uniqueSyns = set(synonyms)
    if uniqueSyns:
        synList = ", ".join('"%s"' % fun.hashF(s) for s in uniqueSyns)
        Google.execute(
            "SELECT fileID, SUM(score) FROM wordfile "
            "WHERE word in (%s) GROUP BY fileID" % synList)
        for fileID, score in Google.fetchall():
            fileMap[fileID] = fileMap.get(fileID, 0) + score / 100
    # --- resolve file paths, best score first --------------------------
    results = []
    for fileID, score in sorted(fileMap.items(),
                                key=operator.itemgetter(1), reverse=True):
        Google.execute("""SELECT fileAdd FROM files WHERE fileID = "%s";""" % (fileID))
        data = Google.fetchall()
        results.append([
            "simple/articles" + data[0][0],
            data[0][0][7:-5].replace("_", " ")
        ])
    return results
def getInsertables(files):
    """Convert raw file entries into (hash, normalized path, category) rows.

    The first 64 characters of each entry (a fixed path prefix) are dropped.
    Talk, user, template and wikipedia pages are skipped entirely; the rest
    are categorized as 1 = article, 2 = category page, 3 = image page.
    Backslashes in the path are normalized to forward slashes (up to 4).
    """
    skipMarkers = ('talk~', 'user~', 'template~', 'wikipedia~')
    rows = []
    for entry in files:
        name = str(entry)[64:]
        lowered = name.lower()
        # Ignore all talk pages, user pages and templates.
        if any(marker in lowered for marker in skipMarkers):
            continue
        if 'Category~' in name:
            category = 2
        elif 'Image~' in name:
            category = 3
        else:
            category = 1
        rows.append((fun.hashF(name), name.replace("\\", "/", 4), category))
    return rows
def search(words):
    """Return every matching file as [path, title], best matches first.

    A single grouped query orders files by SUM(score) * COUNT(score) so
    files matching more query words with higher scores come first; a
    second query per file resolves its stored path.

    FIXES: (1) the original built its IN-list with string slicing
    (``s[:-2]``), producing malformed SQL for an empty word set — now
    guarded; (2) the original reassigned ``data`` inside the loop that was
    iterating ``data``, which only worked because the iterator kept a
    reference to the old list — the match list now has its own name.
    """
    uniqueWords = set(words)
    if not uniqueWords:
        return []
    wordList = ", ".join('"%s"' % fun.hashF(w) for w in uniqueWords)
    Google.execute(
        "SELECT fileID FROM wordfile WHERE word in (%s) "
        "GROUP BY fileID ORDER BY SUM(score) * COUNT(score) desc;" % wordList)
    matches = Google.fetchall()
    results = []
    for row in matches:
        Google.execute("""SELECT fileAdd FROM files WHERE fileID = "%s";""" % (row[0]))
        data = Google.fetchall()
        results.append([
            "simple/articles" + data[0][0],
            data[0][0][7:-5].replace("_", " ")
        ])
    return results
def returnSorted(fileMap):
    """Flatten {file: {word: score}} into sorted (wordHash, file, score) rows.

    Each word of each file becomes one triple keyed by the word's hash;
    the list is returned in ascending tuple order, ready for bulk insert.
    """
    triples = [
        (fun.hashF(word), file, score)
        for file, wordScores in fileMap.items()
        for word, score in wordScores.items()
    ]
    triples.sort()
    return triples