Exemplo n.º 1
0
 def generate():
     """Import URLs from a pod CSV dump and stream SSE progress.

     Reads app/static/pods/urls_from_pod.csv, collects every URL not yet in
     the Urls table, commits them one by one while yielding a percentage
     progress event, then rebuilds the pod index once.

     Yields:
         SSE-formatted strings: "data:<percent>\n\n" per committed URL, or
         "data:no news\n\n" when every URL is already known.
     """
     pod_name = ""
     new_urls = []
     # `with` guarantees the file is closed even if parse_line raises.
     with open(join(dir_path, "app", "static", "pods", "urls_from_pod.csv"),
               'r',
               encoding="utf-8") as f:
         for line in f:
             if "#Pod name" in line:
                 pod_name = line.rstrip('\n').replace("#Pod name:", "")
             # Data rows have exactly 7 comma-separated fields.
             if len(line.rstrip('\n').split(',')) == 7:
                 url, title, snippet, vector, freqs, cc = \
                     index_pod_file.parse_line(line)
                 # Skip URLs already present in the database.
                 if not db.session.query(Urls).filter_by(url=url).all():
                     new_urls.append(Urls(url=url,
                                          title=title,
                                          snippet=snippet,
                                          pod=pod_name,
                                          vector=vector,
                                          freqs=freqs,
                                          cc=cc))
     if not new_urls:
         print("All URLs already known.")
         yield "data:" + "no news" + "\n\n"
     else:
         for count, u in enumerate(new_urls, start=1):
             db.session.add(u)
             db.session.commit()
             yield "data:" + str(int(count / len(new_urls) * 100)) + "\n\n"
         # Rebuild the pod once after all commits. The original called
         # pod_from_file(pod_name) on every iteration with the same
         # pod_name — redundant work, assuming the rebuild is idempotent.
         pod_from_file(pod_name)
Exemplo n.º 2
0
 def generate():
     """Index each URL from urls_to_index.txt and stream SSE progress.

     For every (url, keyword) pair, compute its page vector, rebuild the
     keyword's pod, and yield a percentage progress event.

     Yields:
         SSE-formatted strings "data:<percent>\n\n", one per processed URL.
     """
     urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
     total = len(urls)
     # enumerate(zip(...)) replaces the index loop that mutated its own
     # loop variable (`c += 1` inside `for c in range(...)`).
     for done, (url, kwd) in enumerate(zip(urls, keywords), start=1):
         mk_page_vector.compute_vectors(url, kwd)
         pod_from_file(kwd)
         yield "data:" + str(int(done / total * 100)) + "\n\n"
Exemplo n.º 3
0
 def generate():
     """Index each URL from urls_to_index.txt and stream SSE progress.

     Like the simpler variant, but only rebuilds the keyword's pod when the
     page vector computation succeeded; failures are reported and skipped.

     Yields:
         SSE-formatted strings "data:<percent>\n\n", one per attempted URL.
     """
     urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
     total = len(urls)
     # enumerate(zip(...)) replaces the index loop that mutated its own
     # loop variable (`c += 1` inside `for c in range(...)`).
     for done, (url, kwd) in enumerate(zip(urls, keywords), start=1):
         success = mk_page_vector.compute_vectors(url, kwd)
         if success:
             pod_from_file(kwd)
         else:
             print("Error accessing the URL.")
         yield "data:" + str(int(done / total * 100)) + "\n\n"
Exemplo n.º 4
0
 def generate():
     """Crawl up to 200 links from `url`, index them, and stream progress.

     Pops links from the spider's stack one at a time; each successfully
     vectorized page increments the counter and yields it as an SSE event.
     The pod is rebuilt once after the stack is exhausted.

     Yields:
         SSE-formatted strings "data:<count>\n\n" per indexed page, then
         a final "data:Finished!\n\n".
     """
     stack = spider.get_links(url, 200)
     indexed = 0
     while stack:
         # Pop exactly once up front — the original popped in both the
         # success and failure branches. The unused `all_links` list the
         # original accumulated has been dropped.
         link = stack.pop(0)
         print("Processing", link)
         new_page = mk_page_vector.compute_vectors(link, keyword)
         if new_page:
             indexed += 1
             yield "data:" + str(indexed) + "\n\n"
     pod_from_file(keyword)
     yield "data:" + "Finished!" + "\n\n"
Exemplo n.º 5
0
    def generate():
        """Index each URL from urls_to_index.txt and stream SSE progress.

        Logs (but continues past) read errors reported by readUrls; aborts
        with a single "data: 0" event when the file yielded no URLs or no
        keywords.

        Yields:
            SSE-formatted strings "data:<percent>\n\n", one per attempted
            URL, or a single "data: 0 \n\n" on an invalid file.
        """
        urls, keywords, errors = readUrls(join(dir_path, "urls_to_index.txt"))
        if errors:
            logging.error('Some URLs could not be processed')
        if not urls or not keywords:
            logging.error('Invalid file format')
            yield "data: 0 \n\n"
            # BUG FIX: the original fell through into the loop below after
            # reporting the error; stop the generator here instead.
            return

        total = len(urls)
        for done, (url, kwd) in enumerate(zip(urls, keywords), start=1):
            success = mk_page_vector.compute_vectors(url, kwd)
            if success:
                pod_from_file(kwd)
            else:
                logging.error("Error accessing the URL")
            yield "data:" + str(int(done / total * 100)) + "\n\n"
Exemplo n.º 6
0
 def generate():
     """Breadth-first crawl from `url`, indexing pages and streaming progress.

     Seeds the stack with same-site links found on `url` (links containing
     the base URL, fragments excluded), then keeps extracting and enqueuing
     unseen same-site links from each successfully indexed page. The pod is
     rebuilt once at the end.

     Yields:
         SSE-formatted strings "data:<count>\n\n" per indexed page, then
         a final "data:Finished!\n\n".
     """
     all_links = [url]
     # Same-site heuristic: the seed URL must be a substring of the link.
     stack = list(set(link for link in extract_links(url)
                      if url in link and '#' not in link))
     indexed = 0
     while stack:
         link = stack.pop(0)
         all_links.append(link)
         print("Processing", link)
         new_page = mk_page_vector.compute_vectors(link, keyword)
         if new_page:
             # BUG FIX: filter the links freshly extracted from this page.
             # The original filtered the stale `links` list from the seed
             # page, so `new_links` was computed but never used and the
             # crawl never followed links discovered on sub-pages.
             new_links = extract_links(link)
             fresh = set(nl for nl in new_links
                         if url in nl and nl not in all_links
                         and '#' not in nl)
             stack = list(set(stack) | fresh)
             indexed += 1
             yield "data:" + str(indexed) + "\n\n"
     pod_from_file(keyword)
     yield "data:" + "Finished!" + "\n\n"