Example #1
def create_report(name, full_url, domain_name, nmap, robots_txt, whois):
    project_dir = ROOT_DIRS + '/' + name
    create_directory(project_dir)
    write_file(project_dir + '/full-url.txt', full_url)
    write_file(project_dir + '/domain-name.txt', domain_name)
    write_file(project_dir + '/nmap.txt', nmap)
    write_file(project_dir + '/robots-txt.txt', robots_txt)
    write_file(project_dir + '/whois.txt', whois)
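The snippet above relies on a ROOT_DIRS constant and two small helpers, create_directory and write_file, that are defined elsewhere in the project. A minimal, hypothetical stand-in for them, assuming they simply wrap os.makedirs and a plain text write, is enough to run the example on its own:

import os

ROOT_DIRS = 'reports'  # assumed output root; the real project defines its own value

def create_directory(path):
    # Create the directory (and any missing parents); do nothing if it already exists.
    os.makedirs(path, exist_ok=True)

def write_file(path, data):
    # Write the text to the file, creating or overwriting it.
    with open(path, 'w') as f:
        f.write(str(data))

# Example call with placeholder data:
create_report('example.com', 'https://example.com/', 'example.com',
              '80/tcp open http', 'User-agent: *', 'Domain Name: EXAMPLE.COM')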
Example #2
def create_report(name, full_url, domain_name, nmap, robots_txt, whois):
    project_dir = ROOT_DIRS + '/' + name
    create_directory(project_dir)
    write_file(project_dir + '/full-url.txt', full_url)
    write_file(project_dir + '/domain-name.txt', domain_name)
    write_file(project_dir + '/nmap.txt', nmap)
    write_file(project_dir + '/robots-txt.txt', robots_txt)
    write_file(project_dir + '/whois.txt', whois)
Example #3
    def read_crawling_status2(self, crawling_status_path):
        if not Crawl_path.debug:
            try:
                # The '2' suffix is the backup copy of crawling_status.pbf written by add_count.
                prod_count = general.file_to_list(crawling_status_path + '2')
                if len(prod_count) >= 5:
                    self.found += int(prod_count[0])
                    self.pnf += int(prod_count[1])
                    self.tag_failed += int(prod_count[2])
                    self.proxy_blocked += int(prod_count[3])
                    self.other += int(prod_count[4])
            except Exception as e:
                general.write_file('panacea_errors.txt', str(e))
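general.file_to_list and general.write_file are the project's own utilities; the method above only works if file_to_list returns the file's lines as a list of strings. A minimal stand-in under that assumption:

def file_to_list(path):
    # Assumed behaviour: one list entry per non-empty line, whitespace stripped.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]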
Example #4
def save_results(nmap_scan_results, robots_txt_file, whois_info, url):

    website_dir = ROOT_DIR + '/' + domainName.get_domain(url) + '/'

    # Directory for the website info
    general.create_directory(website_dir)

    # Generate files from the website data.
    general.write_file(website_dir + "nMap_scan.txt", nmap_scan_results)
    general.write_file(website_dir + "robots_txt_file.txt", robots_txt_file)
    general.write_file(website_dir + "whois_info.txt", whois_info)

    print("\n" + "Scan complete!!")
    print("\n" + "Results in: " + ROOT_DIR + "/" + domainName.get_domain(url))
Example #5
File: main.py Project: bhavya444/web-fetch
def create_report(name, nmap, robots_txt, whois):
    project_dir = ROOT_DIRS + '/' + name
    create_directory(project_dir)
    write_file(project_dir + '/nmap.txt', nmap)
    write_file(project_dir + '/robots-txt.txt', robots_txt)
    write_file(project_dir + '/whois.txt', whois)
    def start(self, worker_function=None, input_list=None):
        try:
            self.start_time = datetime.datetime.now()
            self.logger.info("Batch_start_time - " + str(self.start_time))
            if not Crawl_path.debug:
                self.logger.info("debug-False, creating db queue")
                cur, conn = self.ext_connect_postgre()
                db_thread = threading.Thread(target=self.store_data,
                                             name='Thread-store_data',
                                             args=[cur, conn])
                db_thread.daemon = True
                db_thread.start()
                mm_thread = threading.Thread(target=self.monitor_memory,
                                             name='Thread-monitor_memory')
                mm_thread.daemon = True
                mm_thread.start()
                # self.data_queue.put(url)
            if not os.path.isfile(self.input_crawled_file):
                self.logger.info("creating crawled file")
                general.write_file(self.input_crawled_file, '')
                f = open(self.input_crawled_file, 'w+')
                f.close()
            self.logger.info("checking input list")
            if input_list is None:
                self.logger.info("reading new inputs from file")
                self.input_url = general.read_csv(self.input_file,
                                                  skip_header=True)
            else:
                self.logger.info("reading inputs provided by user")
                self.input_url = input_list
            if str(self.property['resume_crawl']).lower() == 'off':
                self.logger.info(
                    "resume crawl off. deleting- crawling_status, pnf, proxy_blocked and tag_failed"
                )
                self.delete_file(self.input_crawled_file)
                self.delete_file(self.current_path + '\\crawling_status.pbf')
                self.delete_file(self.current_path + '\\pnf.txt')
                self.delete_file(self.current_path + '\\proxy_blocked.txt')
                self.delete_file(self.current_path + '\\tag_failed.txt')
                self.delete_file(self.current_path + '\\other_exception.txt')
            else:
                self.logger.info("resume crawl on")
                self.input_crawled_url = general.read_csv(
                    self.input_crawled_file)

            self.logger.info("creating workers")
            self.create_workers()
            self.logger.info("Initiating crawl")
            self.crawl()
            if not Crawl_path.debug:
                self.logger.info("waiting for push data to db")
                self.data_queue.join()
                cur.close()
                conn.close()

            self.end_time = datetime.datetime.now()
            time_taken = self.end_time - self.start_time
            self.logger.info("Time taken to run the batch - " +
                             str(time_taken))
            self.logger.info("Batch_end_time - " + str(self.end_time))
            print("Crawling completed successfully")
            logging.shutdown()

        except Exception as e:
            print(e)
            self.logger.error("Error in start method - " + str(e))
    def add_count(self, encoding=None):
        if self.push_data_value[threading.current_thread().name] == '':
            self.push_data_value[
                threading.current_thread().name] = 'other_exception'
        encoding = encoding if encoding is not None else str(
            Crawl_path.encoding)
        crawling_status_path = os.path.join(self.current_path,
                                            'crawling_status.pbf')
        self.crawling_status_lock.acquire()
        try:
            if self.push_data_value[
                    threading.current_thread().name] == 'found':
                self.found += 1
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
            elif self.push_data_value[
                    threading.current_thread().name] == 'pnf':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
                self.pnf += 1
            elif self.push_data_value[
                    threading.current_thread().name] == 'tag_failed':
                if not self.tag_failed_recrawl:
                    self.tag_failed += 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
            elif self.push_data_value[
                    threading.current_thread().name] == 'proxy_blocked':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if not self.proxy_blocked_recrawl:
                    self.proxy_blocked += 1
            elif self.push_data_value[
                    threading.current_thread().name] == 'other_exception':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
                self.other += 1
            else:
                return
            if not os.path.isfile(crawling_status_path):
                self.crawling_status_first = False
                data_to_write = str(self.found) + '\n' + str(
                    self.pnf) + '\n' + str(self.tag_failed) + '\n' + str(
                        self.proxy_blocked) + '\n' + str(self.other) + '\n'
                with open(crawling_status_path, 'w') as f:
                    f.write(str(data_to_write))
                if not Crawl_path.debug:
                    with open(crawling_status_path + '2', 'w') as f:
                        f.write(str(data_to_write))
            else:
                if self.crawling_status_first:
                    self.crawling_status_first = False
                    prod_count = []
                    try:
                        prod_count = general.file_to_list(crawling_status_path)
                    except Exception as e:
                        general.write_file('panacea_errors.txt',
                                           str(general.get_error_line(e)))
                    if len(prod_count) >= 5:
                        try:
                            self.found += int(prod_count[0])
                            self.pnf += int(prod_count[1])
                            self.tag_failed += int(prod_count[2])
                            self.proxy_blocked += int(prod_count[3])
                            self.other += int(prod_count[4])
                        except:
                            self.read_crawling_status2(crawling_status_path)
                    else:
                        self.read_crawling_status2(crawling_status_path)
                self.crawling_status_first = False
                data_to_write = str(self.found) + '\n' + str(
                    self.pnf) + '\n' + str(self.tag_failed) + '\n' + str(
                        self.proxy_blocked) + '\n' + str(self.other) + '\n'
                with open(crawling_status_path, 'w') as f:
                    f.write(str(data_to_write))
                if not Crawl_path.debug:
                    with open(crawling_status_path + '2', 'w') as f:
                        f.write(str(data_to_write))

        except Exception as e:
            general.write_file('panacea_errors.txt', str(e))
        finally:
            # Release in a finally so the early `return` above cannot leave the lock held.
            self.crawling_status_lock.release()
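add_count serialises five counters, one per line and in a fixed order, into crawling_status.pbf, plus a copy with a '2' suffix outside debug mode, which read_crawling_status2 in Example #3 parses back. A small sketch of the assumed on-disk layout, with placeholder values:

# Assumed layout of crawling_status.pbf: five integer lines in this fixed order.
counters = {'found': 0, 'pnf': 0, 'tag_failed': 0, 'proxy_blocked': 0, 'other': 0}
with open('crawling_status.pbf', 'w') as f:
    f.write('\n'.join(str(v) for v in counters.values()) + '\n')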
    def work(self):
        while True:
            self.property = general.read_properties(self.properties_file)
            if 'stop' in self.property:
                if self.property['stop'] == '1':
                    try:
                        if str(threading.current_thread().name) in self.crawl_path.browser:
                            browser = self.crawl_path.browser[str(
                                threading.current_thread().name)]
                            if browser['driver'].service.process:
                                general.close_chrome(browser['driver'],
                                                     browser['profile_path'])
                            del self.crawl_path.browser[str(
                                threading.current_thread().name)]
                    except Exception as e:
                        print(e)
                    self.logger.info(
                        "properties.pbf: stop is on. exhausting the input queue.")
                    print('properties.pbf: stop is on')
                    while not self.queue.empty():
                        try:
                            self.queue.get(False)
                        except Empty:
                            continue
                        self.queue.task_done()
                    break
            self.property_lock.acquire()
            if 'proxy_update' in self.property:
                try:
                    del self.property['proxy_update']
                    general.write_properties(self.properties_file, self.property)
                    self.proxies = general.read_proxies(self.proxy_file)
                except Exception as e:
                    print(str(e))
            self.property_lock.release()
            url = self.queue.get()
            if Crawl_path.debug:
                print(str(threading.current_thread().name) +
                      " is now crawling - " + str(url))
            try:
                self.initiate(url, self.property['region'], self.proxies,
                              threading.current_thread().name)
                gc.collect()
            except Exception as e:
                try:
                    general.write_file('panacea_errors.txt',
                                       str(general.get_error_line(e)))
                    self.push_data('other_exception', [url])
                except Exception as e:
                    print(e)
                    general.write_file('panacea_errors.txt',
                                       str(general.get_error_line(e)))
                    self.logger.error(
                        "Error in work function section-1 for thread - " +
                        str(threading.current_thread().name) + " - " + str(e))
            self.add_count()
            try:
                if str(threading.current_thread().name) in self.crawl_path.browser:
                    browser = self.crawl_path.browser[str(
                        threading.current_thread().name)]
                    if (browser['persistence'] == self.crawl_path.browser_persistence
                            or self.queue.qsize() < self.NUMBER_OF_THREADS):
                        if browser['driver'].service.process:
                            general.close_chrome(browser['driver'],
                                                 browser['profile_path'])
                        del self.crawl_path.browser[str(
                            threading.current_thread().name)]
            except Exception as e:
                print(e)
                general.write_file('panacea_errors.txt',
                                   str(general.get_error_line(e)))
                self.logger.error(
                    "Error in work function section-2 for thread - " +
                    str(threading.current_thread().name) + " - " + str(e))
            self.input_crawled_lock.acquire()
            general.write_csv(self.input_crawled_file, [url])
            self.input_crawled_lock.release()
            if Crawl_path.debug:
                print(str(threading.current_thread().name) +
                      " has completed crawling - " + str(url))
            self.queue.task_done()
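work is written as the body of a daemon worker thread that pulls URLs off a shared queue.Queue and marks each one with task_done(). A minimal sketch of the surrounding setup that create_workers presumably provides; all names here are hypothetical:

import queue
import threading

NUMBER_OF_THREADS = 8
url_queue = queue.Queue()

def worker():
    # Same loop shape as work() above, reduced to the queue handling.
    while True:
        url = url_queue.get()
        try:
            pass  # crawl `url` here
        finally:
            url_queue.task_done()

for i in range(NUMBER_OF_THREADS):
    t = threading.Thread(target=worker, name='Thread-' + str(i + 1))
    t.daemon = True
    t.start()

for url in ['https://example.com/a', 'https://example.com/b']:
    url_queue.put(url)
url_queue.join()  # returns once every queued URL has been marked done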