def main(query, ranker):
    """Build the index from the crawled pages, then rank them for `query`
    using either cosine similarity or query-dependent PageRank."""
    crawler_tuple = pk.open_pickle("crawler_tuple_pages.pkl")
    global inverted
    global documents
    global max_freq
    global cosine
    global N
    global len_dict
    for url in crawler_tuple.keys():
        N += 1
        a, b = url, crawler_tuple[url][1]  # b is the page's token list
        documents.update({a: b})
        inverted_index(a, b)
        # Count of the most frequent token, used to normalise term frequencies
        res = max(set(b), key=b.count)
        max_freq.update({a: b.count(res)})
    len_dict = {}
    for docno, file in documents.items():
        val = tf_idf(docno, file)
        len_dict.update({docno: val})
    if ranker == 'cosine':
        return cosine_calc(query)
    elif ranker == 'PageRank':
        return page_rank_calc(query)

# Example usage:
#   main('computer', 'cosine')
#   main('computer', 'PageRank')
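# `inverted_index`, `tf_idf`, and `cosine_calc` are defined elsewhere in this
# module. For reference, a minimal sketch of what `cosine_calc` might look
# like, assuming `inverted` maps each term to per-document weights and
# `len_dict` holds document vector lengths -- the names and shapes here are
# illustrative, not the project's actual implementation:
def cosine_calc_sketch(query):
    cleaned_query = process_query(query)
    scores = {}
    for docno in documents:
        # Dot product of the query terms with the document's term weights,
        # normalised by the precomputed document vector length.
        dot = sum(inverted.get(term, {}).get(docno, 0) for term in cleaned_query)
        if len_dict.get(docno):
            scores[docno] = dot / len_dict[docno]
    return [k for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)]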
def page_rank_calc(query):
    """Rank pages for `query` by their query-dependent PageRank scores."""
    cleaned_query = process_query(query)
    pageranks = pk.open_pickle('qdpr.pkl')
    pr_rank = score(pageranks, cleaned_query)
    # Sort URLs by score, highest first
    qdpr_rank = [k for k, v in sorted(pr_rank.items(), key=lambda item: item[1], reverse=True)]
    return qdpr_rank
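# A minimal sketch of what `score` might do here, assuming `pageranks` maps
# each URL to a dict of per-term QD-PageRank values (the structure built by
# the qdpr script below) -- an illustration, not the project's actual code:
def score_sketch(pageranks, cleaned_query):
    pr_rank = {}
    for url, term_scores in pageranks.items():
        # A page's score for the query is the sum of its query-dependent
        # PageRank values over the query terms it contains.
        pr_rank[url] = sum(term_scores.get(term, 0) for term in cleaned_query)
    return pr_rank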
def open_peaks(self, filename):
    peaks = open_pickle(self.peaks, filename)
    self.set_peaks(peaks)
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 27 13:50:24 2020

@author: Sheetal

Calculates the tf-idf weight for each term on each crawled page and
computes the inlinks.
"""
import math
import pickle_functions as pk

if __name__ == "__main__":
    word_count = pk.open_pickle('word_count.pkl')
    vocab = pk.open_pickle('vocab.pkl')
    crawler_tuple = pk.open_pickle('crawler_tuple_pages.pkl')

    tfidf = {}
    N = len(word_count)  # N is the total number of webpages scraped
    for url in word_count:
        tfidf[url] = {}
        for token in word_count[url]:
            # Term frequency, normalised by the most frequent term on the page
            tf = word_count[url][token] / max(word_count[url].values())
            # Inverse document frequency: vocab[token] is the number of
            # pages containing the token
            idf = math.log(N / vocab[token], 2)
            tfidf[url][token] = tf * idf
    pk.save_pickle('tfidf.pkl', tfidf)

    inlink = {}
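# Worked example of the tf-idf computation above on toy data (illustrative):
#   word_count = {'u1': {'data': 4, 'mining': 2}, 'u2': {'data': 1}}
#   vocab      = {'data': 2, 'mining': 1}    # document frequencies
#   N = 2
#   tf('u1', 'mining')    = 2 / 4 = 0.5      (normalised by u1's max count)
#   idf('mining')         = log2(2 / 1) = 1.0
#   tfidf['u1']['mining'] = 0.5 * 1.0 = 0.5
#   idf('data')           = log2(2 / 2) = 0.0, so 'data' scores 0 on both pages.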
            a_tags = bs.find_all('a')
            for a in a_tags:
                try:
                    if re.search('.+?uic.edu', a["href"]) is not None:
                        if not any(ext in a["href"] for ext in skip_exten):
                            parse = urlparse(a["href"])
                            host_path = parse.netloc + parse.path
                            # Remove a leading "www." prefix explicitly;
                            # str.lstrip("www.") would strip those *characters*,
                            # not the prefix, mangling hosts like "ww2.uic.edu"
                            if host_path.startswith("www."):
                                host_path = host_path[len("www."):]
                            temp_href = host_path.rstrip("/")
                            if (uic_domain in a["href"]
                                    and temp_href not in links_dict.values()
                                    and temp_href not in visited):
                                url_queue.append(temp_href)
                except KeyError:
                    # <a> tag without an href attribute
                    continue
            print(page_num)
            if page_num > search_limit:
                break
        except Exception:
            print("Connection failed for ", url)
            continue

    web_crawler = scrape(visited, vocab)
    pk.save_pickle('crawler_tuple_pages.pkl', web_crawler)
    pk.save_pickle('word_count.pkl', word_count)
    pk.save_pickle('vocab.pkl', vocab)
    pk.save_pickle('page_content.pkl', page_content)
else:
    web_crawler = pk.open_pickle('crawler_tuple_pages.pkl')
    word_count = pk.open_pickle('word_count.pkl')
    vocab = pk.open_pickle('vocab.pkl')
    page_content = pk.open_pickle('page_content.pkl')
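# Example of the link normalisation above (illustrative values):
#   urlparse("https://www.cs.uic.edu/people/")
#     -> netloc = "www.cs.uic.edu", path = "/people/"
#   after stripping the "www." prefix and the trailing "/":
#     temp_href = "cs.uic.edu/people"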
    # Iterate the QD-PageRank update a fixed number of times
    while count < 10:
        for url in tfidf:
            for token in tfidf[url]:
                # Rank flowing in from the pages that link to `url`
                s = 0
                for i in inlink[url]:
                    s += ((qdpr_dict[i][token] if token in qdpr_dict[i] else 0)
                          * pqitoj(token, i, url, tfidf))
                # Query-dependent "random jump": this page's share of the
                # collection's total tf-idf mass for the token
                prQuery = tfidf[url][token] / sum(
                    tfidf[i][token] if token in tfidf[i] else 0 for i in tfidf)
                qdpr_dict[url][token] = (1 - df) * prQuery + df * s
        count += 1
    return qdpr_dict


if __name__ == "__main__":
    crawler_tuple = pk.open_pickle("crawler_tuple_pages.pkl")
    tfidf = pk.open_pickle("tfidf.pkl")
    # Build the inlink map once and cache it on disk
    if os.path.exists('inlink.pkl'):
        inlink = pk.open_pickle("inlink.pkl")
    else:
        inlink = inlinkFunc(tfidf, crawler_tuple)
        pk.save_pickle("inlink.pkl", inlink)
    qdpr_fin = qdpr(tfidf, crawler_tuple, inlink)
    print(qdpr_fin)
    pk.save_pickle("qdpr.pkl", qdpr_fin)
    qr = pk.open_pickle('qdpr.pkl')
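# `pqitoj` is the query-dependent transition probability P_q(i -> j): a surfer
# on page i follows the link to page j with probability proportional to j's
# tf-idf weight for the token, normalised over all of i's outlinks. A minimal
# sketch, assuming an `outlink` dict mapping each page to the pages it links
# to (the real pqitoj takes no such parameter and may differ):
def pqitoj_sketch(token, i, j, tfidf, outlink):
    denom = sum(tfidf[k].get(token, 0) for k in outlink[i] if k in tfidf)
    if denom == 0:
        return 0
    return tfidf[j].get(token, 0) / denom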