Example #1
from os.path import join
from flask import Response

def progress_crawl():
    print("Running progress crawl")
    # readUrls, spider, mk_page_vector, pod_from_file and dir_path are
    # project helpers assumed to be defined elsewhere in the codebase.
    # readUrls returns parallel lists; only the first URL/keyword pair is used.
    url, keyword = readUrls(join(dir_path, "urls_to_index.txt"))
    url = url[0]
    keyword = keyword[0]

    def generate():
        all_links = [url]
        # Seed the crawl frontier with up to 200 links from the start page.
        stack = spider.get_links(url, 200)
        indexed = 0
        while stack:
            link = stack.pop(0)
            all_links.append(link)
            print("Processing", link)
            new_page = mk_page_vector.compute_vectors(link, keyword)
            if new_page:
                indexed += 1
                # One SSE event per successfully indexed page.
                yield "data:" + str(indexed) + "\n\n"
        pod_from_file(keyword)
        yield "data:Finished!\n\n"

    return Response(generate(), mimetype='text/event-stream')
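All five examples stream progress over Server-Sent Events: each "data:...\n\n" chunk is one event, and the text/event-stream mimetype keeps the connection open. Below is a minimal, self-contained sketch of how such a generator is wired to a Flask route; the app layout and the /progress path are assumptions, not part of the snippets on this page.

from flask import Flask, Response

app = Flask(__name__)

@app.route("/progress")
def progress():
    def generate():
        # Stand-in for the crawl generators above: three progress events,
        # then the same "Finished!" sentinel.
        for i in range(1, 4):
            yield "data:" + str(i) + "\n\n"
        yield "data:Finished!\n\n"
    # text/event-stream tells the client to treat each "data:" block,
    # terminated by a blank line, as one event.
    return Response(generate(), mimetype='text/event-stream')

if __name__ == "__main__":
    app.run()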
Example #2
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    # Count from 1 so each event reports the percentage of URLs done so far.
    for c, (url, keyword) in enumerate(zip(urls, keywords), start=1):
        mk_page_vector.compute_vectors(url, keyword)
        pod_from_file(keyword)
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
Example #3
def progress_crawl():
    print("Running progress crawl")
    url, keyword = readUrls(join(dir_path, "urls_to_index.txt"))
    url = url[0]
    keyword = keyword[0]

    def generate():
        netloc = urlparse(url).netloc  # only used by the commented-out filter below
        all_links = [url]
        links = extract_links(url)
        # Alternative filter: keep any link on the same host.
        # stack = list(set(link for link in links if urlparse(link).netloc == netloc))
        stack = list(set(link for link in links if url in link and '#' not in link))
        indexed = 0
        while stack:
            link = stack.pop(0)
            all_links.append(link)
            print("Processing", link)
            new_page = mk_page_vector.compute_vectors(link, keyword)
            if new_page:
                # Enqueue unseen same-site links from the page just indexed.
                new_links = extract_links(link)
                # new_site_links = list(set(l for l in new_links if urlparse(l).netloc == netloc and l not in all_links and '#' not in l))
                new_site_links = list(set(l for l in new_links
                                          if url in l and l not in all_links and '#' not in l))
                stack = list(set(stack + new_site_links))
                indexed += 1
                yield "data:" + str(indexed) + "\n\n"
        pod_from_file(keyword)
        yield "data:Finished!\n\n"

    return Response(generate(), mimetype='text/event-stream')
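Example #3 depends on an extract_links helper that is never shown on this page. Here is a minimal sketch of what it plausibly does, using requests and BeautifulSoup; both libraries and the exact behavior are assumptions, and the real project helper may differ.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links(url):
    """Fetch url and return the absolute URLs of all <a href> links on it."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    # urljoin resolves relative hrefs against the page URL.
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]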
Example #4
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    # Count from 1 so each event reports the percentage of URLs done so far.
    for c, (url, keyword) in enumerate(zip(urls, keywords), start=1):
        success = mk_page_vector.compute_vectors(url, keyword)
        if success:
            pod_from_file(keyword)
        else:
            print("Error accessing the URL.")
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
Example #5
def generate():
    urls, keywords, errors = readUrls(join(dir_path, "urls_to_index.txt"))
    if errors:
        logging.error('Some URLs could not be processed')
    if not urls or not keywords:
        logging.error('Invalid file format')
        # Report zero progress and stop: there is nothing to index.
        yield "data:0\n\n"
        return

    c = 0
    for url, kwd in zip(urls, keywords):
        success = mk_page_vector.compute_vectors(url, kwd)
        if success:
            pod_from_file(kwd)
        else:
            logging.error("Error accessing the URL")
        c += 1
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"