from crawler import Crawler
from indexer import Indexer
from query_processor import QuerryProcessor
from document import Document
from time import sleep

if __name__ == '__main__':
    # sleep(5.0)
    # print("THREAD-TIME!")

    # crawl the seed site breadth-first
    crawler = Crawler('https://www.in.gr', 20, 5, True, 'BFS')
    crawler.initializeCrawl()

    # index the crawled documents, then add the query itself as a document
    # so it gets TF-IDF weights alongside the corpus
    ind = Indexer(Crawler.documents)
    query = input("Enter your search query: ")
    ind.add_document(Document('search_query', query))

    print('Building Indexer...')
    ind.create_indexer()
    print('Calculating TF-IDFs. May take a while.')
    ind.calculate_scores()

    # rank documents by cosine similarity against the query, highest first
    qp = QuerryProcessor(ind.inverted_index, len(ind.documents))
    docs_with_cos_ = qp.compare_documents()
    docs_with_cos_ = sorted(docs_with_cos_, key=lambda x: x[1], reverse=True)

    print(f'Showing top results based on your query "{query}":')
    for doc in docs_with_cos_:
        print(doc[0].link)
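
# A minimal, self-contained sketch of the ranking step above: build TF-IDF
# vectors for a corpus that includes the query as a document (mirroring the
# add_document('search_query', ...) call), then sort documents by cosine
# similarity against the query vector. All names here (tf_idf_vectors,
# cosine_similarity, the sample docs) are illustrative, not the actual
# Indexer/QuerryProcessor API.
import math
from collections import Counter


def tf_idf_vectors(docs):
    """Map each doc id to a {term: tf-idf weight} vector."""
    tokenized = {doc_id: text.lower().split() for doc_id, text in docs.items()}
    df = Counter()  # document frequency of each term
    for tokens in tokenized.values():
        df.update(set(tokens))
    n = len(docs)
    vectors = {}
    for doc_id, tokens in tokenized.items():
        tf = Counter(tokens)
        vectors[doc_id] = {
            term: (count / len(tokens)) * math.log(n / df[term])
            for term, count in tf.items()
        }
    return vectors


def cosine_similarity(a, b):
    dot = sum(w * b.get(term, 0.0) for term, w in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0


docs = {
    'd1': 'python web crawler tutorial',
    'd2': 'cooking pasta at home',
    'query': 'web crawler',
}
vectors = tf_idf_vectors(docs)
query_vec = vectors.pop('query')
ranked = sorted(
    ((doc_id, cosine_similarity(vec, query_vec)) for doc_id, vec in vectors.items()),
    key=lambda x: x[1],
    reverse=True,
)
print(ranked)  # d1 should outrank d2 for this query
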
# Imports assumed from local modules; adjust the paths to the project layout.
from frontier import Frontier
from parser import Parser
from indexer import Indexer
from graph import Graph

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html',
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)

    # add document to indexer
    indexer.add_document(url, body)

    # build our web graph: one node per URL, one edge per outgoing link
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)
    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)

    # hand links to the frontier to make sure they are all crawled
    frontier.add_urls(links_on_page)

# for node in web_graph:
#     print(node)
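
# A minimal sketch of the Frontier the loop above relies on: a FIFO queue
# that keeps yielding URLs while add_urls() grows it mid-crawl, and that
# skips URLs it has already handed out so each page is fetched once. This
# illustrates the assumed interface, not the project's actual implementation.
from collections import deque


class Frontier:
    def __init__(self, seed_urls):
        self._queue = deque(seed_urls)
        self._seen = set(seed_urls)

    def add_urls(self, urls):
        # enqueue only never-seen URLs, so re-discovered pages are ignored
        for url in urls:
            if url not in self._seen:
                self._seen.add(url)
                self._queue.append(url)

    def __iter__(self):
        # keep draining the queue even as add_urls() appends new entries
        while self._queue:
            yield self._queue.popleft()


# usage: duplicate seeds and already-crawled links are dropped silently
f = Frontier(['http://mysql12.f4.htw-berlin.de/crawl/d01.html'])
f.add_urls(['http://mysql12.f4.htw-berlin.de/crawl/d01.html'])  # ignored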