def dfs_spider(url, word, max_pages):
    """Depth-first crawl starting at *url*, searching each page for *word*.

    Visits at most *max_pages* pages. For every page whose text contains
    the keyword, stores the page details in the 'PageDetails' table and
    prints the five most relevant keywords of that page. After the crawl,
    prints a notice if the keyword was never found.

    NOTE(review): relies on project helpers defined elsewhere in this
    project (create_table, WebCrawler, find_all_occurrences, get_words,
    get_page_details, add_item).
    """
    from collections import deque  # O(1) left pops; list.pop(0)/insert(0, ...) are O(n)

    pages_to_visit = deque([url])
    pages_visited = {url}
    number_of_pages_visited = 0
    found_word = False

    table_name = 'PageDetails'
    create_table(table_name)

    while number_of_pages_visited < max_pages and pages_to_visit:
        number_of_pages_visited += 1
        url = pages_to_visit.popleft()
        try:
            print("---------------------------")
            print(" ")
            print(number_of_pages_visited, "Visiting:", url)

            crawler = WebCrawler()
            data, links = crawler.get_links(url)

            if word in data:
                num_of_occurrences = len(list(find_all_occurrences(data, word)))
                found_word = True
                words = get_words(url)
                page_details = get_page_details(url, word, num_of_occurrences, links)
                add_item(page_details, table_name)
                print(" ")
                print("Found the keyword!")
                print(" ")
                print("The most relevant keywords in the page are: ")
                print(words[:5])

            # Push unseen links on the *front* so the next pop goes deeper
            # into the most recently discovered page (depth-first order).
            for link in links:
                if link not in pages_visited:
                    pages_visited.add(link)
                    pages_to_visit.appendleft(link)
        except Exception as e:
            # Best-effort crawl: one failing page must not abort the run.
            print(str(e))

    print("---------------------------")
    print(" ")
    if not found_word:
        print("Keyword not found...")
G.add_node(word) #urls = response(['Items']) # print(response) for url in response['Items']: G.add_node(url['url']) G.add_edge(word,url['url'],weight = url['word_count']) nx.draw(G,with_labels=True) plt.savefig("chart.png") plt.show() if __name__ == '__main__': table_name = 'PageDetails' db.create_table(table_name) item = {} item['url'] = 'url1' item['word'] = 'word1' item['word_count'] = 15 item['hyperlinks'] = ['hard', 'to', 'combine'] db.add_item(item, table_name) item2 = {} item2['url'] = 'hard' item2['word'] = 'word1' item2['word_count'] = 16 item2['hyperlinks'] = ['blabla', 'to', 'combine'] db.add_item(item2, table_name) draw_chart("word1",2)