Пример #1
0
def dfs_spider(url, word, max_pages):
    """Crawl depth-first from ``url`` and record pages that contain ``word``.

    At most ``max_pages`` pages are attempted; a page whose fetch or
    processing raises still counts towards that limit. For every page whose
    content contains ``word``, the occurrence count and the page details are
    stored in the ``PageDetails`` table through ``add_item``, and the first
    five entries returned by ``get_words`` are printed.

    Args:
        url: Page at which the crawl starts.
        word: Keyword searched for in each page's content.
        max_pages: Upper bound on the number of pages to attempt.
    """
    # A list used as a LIFO stack. Pushing/popping at the tail is O(1) and
    # yields the same visiting order as insert(0, ...)/pop(0) at the head,
    # which is O(n) per operation.
    pages_to_visit = [url]
    # Every URL ever scheduled, so no page is pushed onto the stack twice.
    pages_seen = {url}
    number_of_pages_visited = 0
    found_word = False

    table_name = 'PageDetails'
    create_table(table_name)

    while number_of_pages_visited < max_pages and pages_to_visit:
        number_of_pages_visited += 1
        current_url = pages_to_visit.pop()

        try:
            print("---------------------------")
            print(" ")
            print(number_of_pages_visited, "Visiting:", current_url)
            crawler = WebCrawler()
            data, links = crawler.get_links(current_url)
            if data.find(word) > -1:
                # Count the matches without materialising them in a list.
                num_of_occurrences = sum(
                    1 for _ in find_all_occurrences(data, word)
                )
                found_word = True
                words = get_words(current_url)
                page_details = get_page_details(
                    current_url, word, num_of_occurrences, links
                )
                add_item(page_details, table_name)
                print(" ")
                print("Found the keyword!")
                print(" ")
                print("The most relevant keywords in the page are: ")
                print(words[:5])

            for link in links:
                if link not in pages_seen:
                    pages_seen.add(link)
                    pages_to_visit.append(link)
        except Exception as e:
            # Best effort: a page that fails to load or parse is reported
            # and skipped so that the rest of the crawl can continue.
            print(str(e))

    print("---------------------------")
    print(" ")
    if not found_word:
        print("Keyword not found...")
Пример #2
0
  G.add_node(word)
  #urls = response(['Items'])
  # print(response)
  for url in response['Items']:
    G.add_node(url['url'])
    G.add_edge(word,url['url'],weight = url['word_count'])

  nx.draw(G,with_labels=True)
  plt.savefig("chart.png")
  plt.show()

if __name__ == '__main__':
  # Manual demo: create the table, store two sample records for 'word1',
  # then draw the chart for that word.
  table_name = 'PageDetails'
  db.create_table(table_name)

  # The first record's hyperlinks contain 'hard', which is the url of the
  # second record.
  sample_items = (
    {
      'url': 'url1',
      'word': 'word1',
      'word_count': 15,
      'hyperlinks': ['hard', 'to', 'combine'],
    },
    {
      'url': 'hard',
      'word': 'word1',
      'word_count': 16,
      'hyperlinks': ['blabla', 'to', 'combine'],
    },
  )
  for entry in sample_items:
    db.add_item(entry, table_name)

  # NOTE(review): the meaning of the second argument (2) is not visible
  # here -- confirm against draw_chart's signature.
  draw_chart("word1", 2)