Example #1
 def generate():
     # Read the parallel lists of URLs and keywords to index.
     urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
     for c, (url, kwd) in enumerate(zip(urls, keywords), start=1):
         mk_page_vector.compute_vectors(url, kwd)
         pod_from_file(kwd)
         # Stream progress (percent complete) as a server-sent event.
         yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
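All five examples stream strings in the "data: ...\n\n" format of server-sent events (SSE). As a minimal sketch, assuming the generator above lives in a Flask app (the route name and app object are illustrative, not part of the example), it would typically be served like this:

 from flask import Flask, Response

 app = Flask(__name__)

 @app.route("/index-progress")
 def index_progress():
     # The text/event-stream MIME type makes the browser treat each
     # yielded "data: ...\n\n" chunk as one server-sent event.
     return Response(generate(), mimetype="text/event-stream")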
Example #2
 def generate():
     # Read the parallel lists of URLs and keywords to index.
     urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
     for c, (url, kwd) in enumerate(zip(urls, keywords), start=1):
         success = mk_page_vector.compute_vectors(url, kwd)
         if success:
             # Only record the pod entry if the page was fetched and vectorised.
             pod_from_file(kwd)
         else:
             print("Error accessing the URL.")
         # Stream progress (percent complete) as a server-sent event.
         yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
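Examples #1 and #2 rely on a readUrls helper that returns parallel lists of URLs and keywords (Example #4's variant also returns an error flag). A hypothetical sketch consistent with those calls, assuming one semicolon-separated "url;keyword" pair per line; the file format is an assumption, not shown in the examples:

 # Hypothetical helper; the "url;keyword" line format is an assumption.
 def readUrls(path):
     urls, keywords = [], []
     with open(path) as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
             url, _, keyword = line.partition(";")
             urls.append(url.strip())
             keywords.append(keyword.strip())
     return urls, keywords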
Example #3
 def generate():
     # netloc = urlparse(url).netloc
     all_links = [url]
     # Seed the crawl queue with links gathered from the start page.
     stack = spider.get_links(url, 200)
     indexed = 0
     while stack:
         link = stack.pop(0)
         all_links.append(link)
         print("Processing", link)
         new_page = mk_page_vector.compute_vectors(link, keyword)
         if new_page:
             indexed += 1
             # Stream the running count of indexed pages as a server-sent event.
             yield "data:" + str(indexed) + "\n\n"
     pod_from_file(keyword)
     yield "data:" + "Finished!" + "\n\n"
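One way to watch such a stream from the command line is the third-party requests library; the URL below assumes the hypothetical /index-progress route sketched after Example #1:

 import requests

 # Read the event stream line by line as the server yields progress events.
 with requests.get("http://localhost:5000/index-progress", stream=True) as r:
     for line in r.iter_lines(decode_unicode=True):
         if line:  # skip the blank separator lines between events
             print(line)  # e.g. "data:1", "data:2", ..., "data:Finished!"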
Example #4
 def generate():
     urls, keywords, errors = readUrls(join(dir_path, "urls_to_index.txt"))
     if errors:
         logging.error('Some URLs could not be processed')
     if not urls or not keywords:
         logging.error('Invalid file format')
         # Report zero progress and stop: there is nothing to index.
         yield "data: 0 \n\n"
         return

     for c, (url, kwd) in enumerate(zip(urls, keywords), start=1):
         success = mk_page_vector.compute_vectors(url, kwd)
         if success:
             pod_from_file(kwd)
         else:
             logging.error("Error accessing the URL")
         # Stream progress (percent complete) as a server-sent event.
         yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
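Compared with Example #2, this variant reports problems through the logging module instead of print, and iterating with zip over the two lists also guards against one list being shorter than the other.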
Example #5
 def generate():
     netloc = urlparse(url).netloc
     all_links = [url]
     links = extract_links(url)
     # Keep only same-site links (approximated by a prefix match on the
     # start URL) and drop in-page fragment links.
     # stack = list(set([link for link in links if urlparse(link).netloc == netloc]))
     stack = list(set([link for link in links if url in link and '#' not in link]))
     indexed = 0
     while stack:
         link = stack.pop(0)
         all_links.append(link)
         print("Processing", link)
         new_page = mk_page_vector.compute_vectors(link, keyword)
         if new_page:
             # Extend the crawl queue with unseen same-site links from the new page.
             new_links = extract_links(link)
             # new_site_links = list(set([new_link for new_link in new_links if urlparse(new_link).netloc == netloc and new_link not in all_links and '#' not in new_link]))
             new_site_links = list(set([new_link for new_link in new_links if url in new_link and new_link not in all_links and '#' not in new_link]))
             stack = list(set(stack + new_site_links))
             indexed += 1
             yield "data:" + str(indexed) + "\n\n"
     pod_from_file(keyword)
     yield "data:" + "Finished!" + "\n\n"
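Examples #3 and #5 call link-gathering helpers (spider.get_links, extract_links) whose implementations are not shown. Below is a standard-library sketch of what an extract_links helper like the one above might look like; the real helper may fetch and parse pages quite differently:

 from html.parser import HTMLParser
 from urllib.parse import urljoin
 from urllib.request import urlopen

 # Illustrative stdlib-only implementation; an assumption, not the original.
 class LinkParser(HTMLParser):
     def __init__(self, base):
         super().__init__()
         self.base = base
         self.links = []

     def handle_starttag(self, tag, attrs):
         if tag == "a":
             for name, value in attrs:
                 if name == "href" and value:
                     # Resolve relative hrefs against the page URL.
                     self.links.append(urljoin(self.base, value))

 def extract_links(url):
     html = urlopen(url).read().decode("utf-8", errors="ignore")
     parser = LinkParser(url)
     parser.feed(html)
     return parser.links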