Example #1
def build_rec_process(img_dir, train=False, num_thread=1):
    rec_dir = os.path.abspath(os.path.join(img_dir, '../rec'))
    makedirs(rec_dir)
    prefix = 'train' if train else 'val'
    print('Building ImageRecord file for ' + prefix + ' ...')
    to_path = rec_dir

    # download lst file and im2rec script
    script_path = os.path.join(rec_dir, 'im2rec.py')
    script_url = 'https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py'
    download(script_url, script_path)

    lst_path = os.path.join(rec_dir, prefix + '.lst')
    lst_url = 'http://data.mxnet.io/models/imagenet/resnet/' + prefix + '.lst'
    download(lst_url, lst_path)

    # execution
    import sys
    cmd = [
        sys.executable, script_path, rec_dir, img_dir, '--recursive',
        '--pass-through', '--pack-label', '--num-thread',
        str(num_thread)
    ]
    subprocess.call(cmd)
    os.remove(script_path)
    os.remove(lst_path)
    print('ImageRecord file for ' + prefix + ' has been built!')
Example #2
def get_resnet_file(name, root='~/.torch/models'):
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)

    file_path = os.path.join(root, file_name + '.pth')
    sha1_hash = _model_sha1[name]
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return file_path
        else:
            print('Mismatch in the content of model file {} detected. '
                  'Downloading again.'.format(file_path))
    else:
        print('Model file {} is not found. Downloading.'.format(file_path))

    if not os.path.exists(root):
        os.makedirs(root)

    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = os.environ.get('ENCODING_REPO', encoding_repo_url)
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)

    if check_sha1(file_path, sha1_hash):
        return file_path
    else:
        raise ValueError(
            'Downloaded file has different hash. Please try again.')
Example #3
 def download_file(local_archive_path, url):
     if not exists(local_archive_path) or args.force:
         print(f'Downloading {local_archive_path}...')
         local_archive_tmp_path = local_archive_path + ".downloading"
         download(url, local_archive_tmp_path)
         print(f'Downloaded {local_archive_path}')
         rename(local_archive_tmp_path, local_archive_path)
Example #4
def download_zhwiki():
    path = settings.ZHWIKI_PATH
    if os.path.exists(path):
        logger.info(f'zhwiki already downloaded at {path}')
        return
    url = settings.ZHWIKI_URL
    logger.info(f'zhwiki downloading from {url} ...')
    download(url, path)
    logger.info(f'zhwiki downloaded: {path}')
Example #5
    def download(self, force=False):

        if os.path.exists(join(self.root_dir, 'VOC'+self.year, 'JPEGImages'))\
                and os.path.exists(join(self.root_dir, 'VOC'+self.year, 'Annotations'))\
                and os.path.exists(join(self.root_dir, 'VOC'+self.year, 'ImageSets')):
            if not force:
                print('Files already downloaded and verified')
                return
            else:
                shutil.rmtree(join(self.root_dir, 'VOC' + self.year))

        # make the dirs and start the downloads
        os.makedirs(self.root_dir, exist_ok=True)
        if self.year == '2012':
            filenames = ['VOCtrainval_11-May-2012']
            md5s = ['6cd6e144f989b92b3379bac3b3de84fd']
        elif self.year == '2011':
            filenames = ['VOCtrainval_25-May-2011']
            md5s = ['6c3384ef61512963050cb5d687e5bf1e']
        elif self.year == '2010':
            filenames = ['VOCtrainval_03-May-2010']
            md5s = ['da459979d0c395079b5c75ee67908abb']
        elif self.year == '2009':
            filenames = ['VOCtrainval_11-May-2009']
            md5s = ['59065e4b188729180974ef6572f6a212']
        elif self.year == '2008':
            filenames = ['VOCtrainval_14-Jul-2008']
            md5s = ['2629fa636546599198acfcfbfcf1904a']
        elif self.year == '2007':
            filenames = ['VOCtrainval_06-Nov-2007', 'VOCtest_06-Nov-2007']
            md5s = [
                'c52e279531787c972589f7e41ab4ae64',
                '41a8d6e12baa5ab18ee7f8f8029b9e11805b4ef1'
            ]

        for filename in filenames:
            tar_filename = filename + '.tar'
            url = join(self.download_url_prefix, 'voc' + self.year,
                       tar_filename)
            # download_url(url, self.root_dir, tar_filename, None)
            download(url, path=self.root_dir, overwrite=True)

            with tarfile.open(join(self.root_dir, tar_filename),
                              'r') as tar_file:
                tar_file.extractall(self.root_dir)

        shutil.move(
            os.path.join(self.root_dir, 'VOCdevkit', 'VOC' + self.year),
            os.path.join(self.root_dir, 'VOC' + self.year))
        shutil.rmtree(os.path.join(self.root_dir, 'VOCdevkit'))

        for filename in filenames:
            tar_filename = filename + '.tar'
            os.remove(join(self.root_dir, tar_filename))
Example #6
def download_annotations(output_folder):
    print("Downloading annotations...")
    file_name = os.path.basename(annotations_url)
    target_path = os.path.join(output_folder, file_name)
    temp_path = target_path+"_"

    if os.path.isfile(target_path):
        print(f"[*] {target_path} already downloaded.")
    else:
        print(f"Downloading 241MB to \"{target_path}\"...")
        download(annotations_url, temp_path)
        shutil.move(temp_path, target_path)
Example #7
 def run(self):
     count = 0
     while True:
         try:
             tbd_url = self.frontier.get_tbd_url()
             if not tbd_url:
                 self.logger.info("Frontier is empty. Stopping Crawler.")
                 break
             resp = download(tbd_url, self.config, self.logger)
             self.logger.info(
                 f"Downloaded {tbd_url}, status <{resp.status}>, "
                 f"using cache {self.config.cache_server}.")
             scraped_urls = scraper(tbd_url, resp, self.mostCommonWords,
                                    self.icsSubDomains, self.longestPage,
                                    self.similarURLs)
             for scraped_url in scraped_urls:  #For each scraped url, add only if not discovered
                 if (scraped_url not in self.discoveredURLs):
                     self.discoveredURLs.add(scraped_url)
                     self.frontier.add_url(scraped_url)
             self.frontier.mark_url_complete(tbd_url)
             time.sleep(self.config.time_delay)
             count += 1
             print("\n", count, "\n")
         except Exception as err:
             self.logger.error(f"Worker caught an exception: {err}")
Example #8
def download_items():
    counter = 0
    step = 100

    while True:
        PARAMS["limit"] = step
        PARAMS["offset"] = counter
        url = url_concat(BASE_URL, PARAMS)
        response = yield download(url)
        list_of_items = response.json_body.get("cargoquery", [])
        for item in list_of_items:
            # implicit
            raw_html_implicit_stat = item['title']['implicit stat text']
            sanitised_implicit_stat = sanitise_text(raw_html_implicit_stat)
            item['title']['implicit stat text'] = sanitised_implicit_stat

            # explicit
            raw_html_explicit_stat = item['title']['explicit stat text']
            sanitised_explicit_stat = sanitise_text(raw_html_explicit_stat)
            item['title']['explicit stat text'] = sanitised_explicit_stat

        # TODO: Store into elastic search

        if len(list_of_items) == 0:
            break

        counter += step
Example #9
    def run(self):
        i = 0
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            try:
                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper(tbd_url, resp, self.state)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)

            except HTTPError as err:
                self.logger.error(f"Downloaded {tbd_url}, hitting error {err}")

            self.frontier.mark_url_complete(tbd_url)
            if i % 1000 == 0:
                print(self.state['longest_page'])
                print_freq(self.state['word_rank'], 50)
                for domain, count in self.state['sub_domains'].items():
                    print(domain, count)
                self.frontier.print_saved()

            i += 1
Example #10
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            # Politeness. Check if diff is less than 500 miliseconds.
            current_time = int(round(time.time() * 1000))
            parsed = urlparse(tbd_url, allow_fragments=False)
            if parsed.netloc in self.time_visited:
                if current_time - self.time_visited[parsed.netloc] < 500:
                    # print("sleeping for ", (500-(current_time-time_visited[parsed.netloc])-1) * .001)
                    time.sleep(
                        ((500 -
                          (current_time - self.time_visited[parsed.netloc])) +
                         10) * .001)
            current_time = int(round(time.time() * 1000))
            self.time_visited[parsed.netloc] = current_time

            resp = download(tbd_url, self.config, self.logger)
            if resp is False:
                continue
            else:
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")

                scraped_urls = scraper(tbd_url, resp)
                # print("scraped_urls: ", scraped_urls)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
Example #11
    def run(self):
        while True:

            tbd_url = self.frontier.get_tbd_url()

            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            delta = datetime.timedelta(seconds=.5)
            split = urlparse(tbd_url).netloc.split('.')
            #extract domain from url (does not account for toay.blah./blah/blah/)
            domain = split[-3] + '.' + split[-2] + '.' + split[-1]
            print("DOMAIN: " + domain)
            # if we've accessed tbd_url domain within 500ms then sleep
            #   sleep(500ms)
            if domain in last_seen and (
                    datetime.datetime.now() - last_seen[domain] < delta):
                print("====SLEEPING====")
                time.sleep(.5)
            # store tbh_url accessed at current time.
            last_seen[domain] = datetime.datetime.now()

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
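The example's own comment notes that this domain split is incomplete; with split[-3], a two-label host such as example.edu would raise an IndexError. A minimal sketch of a safer variant of the same heuristic, slicing instead of indexing (last_three_labels is an illustrative name, not part of the original crawler):

from urllib.parse import urlparse

def last_three_labels(url):
    # Keep at most the last three labels of the host as the politeness key,
    # e.g. "www.ics.uci.edu" -> "ics.uci.edu", "example.edu" -> "example.edu".
    labels = urlparse(url).netloc.split('.')
    return '.'.join(labels[-3:])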
Example #12
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            # if there is a url to download on the frontier
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            # put the response = download into a try except, in case there is a timeout
            # and resp doesn't equal anything
            resp = download(tbd_url, self.config, self.logger)

            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")

            # after getting the response from the webpage, the function will
            # store the information in the self.frontier
            self.frontier.store_page_text_content(resp, tbd_url)

            scraped_urls = scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

        self.frontier.close_files()

        print("number of unique pages is:", unique_pages(self.frontier.discovered_urls))
        print("longest page is:", longest_page(self.frontier.site_content))
        print("fifty most common words are here:",fifty_most_common_words(self.frontier.word_frequencies))
        print(ics_subdomain_frequencies(self.frontier.discovered_urls))
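The inline comment in the example above notes that the download call should sit inside a try/except so that a timeout does not leave resp undefined. A minimal sketch of that guard for the body of the while loop, assuming the same download signature; catching Exception and skipping the URL rather than retrying is an assumption, not the original author's handling:

            try:
                resp = download(tbd_url, self.config, self.logger)
            except Exception as err:
                # Assumption: on a timeout or connection error, log the failure and move on.
                self.logger.error(f"Failed to download {tbd_url}: {err}")
                self.frontier.mark_url_complete(tbd_url)
                continue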
Example #13
    def run(self):
        while True:
            try:
                tbd_url = self.frontier.get_tbd_url()
                if not tbd_url:
                    self.logger.info("Frontier is empty. Stopping Crawler.")
                    final_report()
                    break

                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper(tbd_url, resp, self.logger)

                # Report the stats every so often
                self.sites_crawled += 1
                if self.sites_crawled >= 100:
                    self.sites_crawled = 0
                    self.logger.info(
                        "Downloaded 100 sites. Generating a report-so-far")
                    final_report()

                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
            except Exception:
                # If the crawler runs into any exception, spit out the final report before re-raising the exception
                self.logger.info(
                    "Worker caught an exception. Generating final report before exit."
                )
                final_report()
                raise
Example #14
    def run(self):

        # this is where we are gonna call the function to get the robot.txt
        record = Recorder()
        while True:
            tbd_url = self.frontier.get_tbd_url()

            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            if not self.can_fetch(tbd_url):
                continue

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")

            scraped_urls = scraper(tbd_url, resp)

            # adding data to recorder
            record.add_url(tbd_url)

            if resp.raw_response is not None and is_valid(tbd_url):
                record.add_words(resp.raw_response.content, tbd_url)

            record.save()

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

        record.finish_crawl_report()
Example #15
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                printStats()
                break
            
            #We will ignore any network exceptions and retry.
            startTime = time.time()
            resp = None
            hasFailed = False
            while resp is None:
                try:
                    resp = download(tbd_url, self.config, self.logger)
                except Exception as ex:
                    hasFailed = True
                    print(f"{ex}\nRetrying in 60 sec.")
                    time.sleep(60)
            if hasFailed:
                with open("server-outages.rtf", "a+") as err:
                    err.write(f"Server outage from: {startTime} to: {time.time()} duration: {round(time.time() - startTime)} sec.\n")

            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            if scraped_urls is not None:
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
Example #16
def download_file(self, remote_path):
    try:
        filename = os.path.basename(remote_path)
        os.system("mkdir -p loot/%s/other" % str(self.addr[0]))
        local_path = "loot/%s/other/%s" % (str(self.addr[0]), filename)
        cmd = {
            'command': 'upload %s' % remote_path,
        }
        self.send_cmd(cmd)
        download(self, local_path)
        response = self.recv_output()['output']
    except Exception as e:
        return ("error", str(e))

    msg = "Downloaded %s to %s" % (filename, local_path)
    return ("success", msg)
Example #17
def screenshot(self):
    try:
        os.system("mkdir -p loot/%s/screenshots" % str(self.addr[0]))
        local_path = "loot/%s/screenshots/%s.png" % (str(
            self.addr[0]), get_timestamp())
        cmd = {'command': 'screenshot'}
        self.send_cmd(cmd)
        download(self, local_path)
        assert self.recv_output()['output']
    except AssertionError:
        return (
            "error",
            "Received a false response. Screenshot may still be saved on remote host"
        )
    except Exception as e:
        return ("error", str(e))
    else:
        msg = "Screenshot saved to %s" % local_path
        return ("success", msg)
Example #18
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         report = self.report
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(
             f"Downloaded {tbd_url}, status <{resp.status}>, "
             f"using cache {self.config.cache_server}.")
         scraped_urls = scraper(tbd_url, resp, report)
         for scraped_url in scraped_urls:
             self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
     # print the result of report.py here
     self.report.print()
Example #19
def buildDB(filename, downloadDir, rebuild=False):
    if os.path.exists(filename) and not rebuild:
        print("NCI Cancer Gene Index SQLite database already exists, not rebuilding")
        return
    initDB(filename)
    if downloadDir is None:
        downloadDir = settings.CGI_DOWNLOAD_PATH
    # Add gene-drug associations
    drugFile = download.download(settings.CGI_GENE_COMPOUND_FILE, downloadDir)
    zf = zipfile.ZipFile(drugFile, "r")
    item = os.path.basename(drugFile).split(".")[0] + ".xml"
    processGeneEntries(zf.open(item), filename)
    zf.close()
    # Add gene-disease associations
    diseaseFile = download.download(settings.CGI_GENE_DISEASE_FILE, downloadDir)
    zf = zipfile.ZipFile(diseaseFile, "r")
    item = os.path.basename(diseaseFile).split(".")[0] + ".xml"
    processGeneEntries(zf.open(item), filename)
    zf.close()
    makeCountTables(filename)
Example #20
    def run(self):
        try:
            while True:
                self._lock()
                try:
                    tbd_url = self.frontier.get_tbd_url()
                    if tbd_url:
                        self.threads_in_processing.add(self.worker_id)
                    elif tbd_url is None and len(
                            self.threads_in_processing) == 0:
                        self.logger.info(
                            f"Frontier is empty. Stopping the Worker: {self.worker_id}"
                        )
                        break
                finally:
                    self._unlock()

                if tbd_url is None or tbd_url == "":
                    time.sleep(0.05)
                    continue

                self.logger.info(
                    f"Worker: {self.worker_id} Downloading: {tbd_url}")
                resp = download(tbd_url, self.config, self.logger)
                if resp.raw_response is None and resp.error.startswith(
                        "EMPTYCONTENT"):
                    self.logger.error(f"{resp.error}, status <{resp.status}>")

                self.logger.info(
                    f"Worker: {self.worker_id} Downloaded : {tbd_url}, status <{resp.status}>"
                )
                scraped_urls = scraper(tbd_url, resp)

                new_urls_added = 0
                self._lock()
                try:
                    for scraped_url in scraped_urls:
                        if (self.frontier.add_url(scraped_url)):
                            new_urls_added += 1
                    self.frontier.mark_url_complete(tbd_url)
                finally:
                    self._unlock()
                    self.threads_in_processing.remove(self.worker_id)

                self.logger.info(
                    f"Worker: {self.worker_id}, Added: {new_urls_added}, Remaining: {self.frontier.count_tbd_urls()}"
                )
        except BaseException:
            self.logger.exception(
                f"Unexpected exception in Worker: {self.worker_id}")
        finally:
            if self.worker_id in self.threads_in_processing:
                self.threads_in_processing.remove(self.worker_id)
            self.logger.info(f"Worker: {self.worker_id} Stopped")
Example #21
    def _load_data(self, filename, offset):
        """
        Load the data in the given file. Automatically downloads the file
        if it does not already exist in the data_dir.

        :param filename: Name of the data-file.
        :param offset: Start offset in bytes when reading the data-file.
        :return: The data as a numpy array.
        """

        # Download the file from the internet if it does not exist locally.
        download(base_url=base_url,
                 filename=filename,
                 download_dir=self.data_dir)

        # Read the data-file.
        path = os.path.join(self.data_dir, filename)
        with gzip.open(path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=offset)

        return data
Example #22
    def robot(self, url):
        parsed = urlparse(url)
        if parsed.netloc not in self.robot_dict:
            robot_obj = robotparser.RobotFileParser()
            robot_resp = download(
                parsed.scheme + "://" + parsed.netloc + "/robots.txt",
                self.config, self.logger)
            if robot_resp.raw_response is not None:
                robot_obj.parse(
                    robot_resp.raw_response.content.decode().split("\n"))
            self.robot_dict[parsed.netloc] = robot_obj

        return self.robot_dict[parsed.netloc]
Example #23
 def run(self):
     self.run_command('bdist_app')
     # Get dmgbuild
     from utils.download import download
     dmgbuild = download(
       'https://github.com/morinted/dmgbuild/releases/download/v1.2.1%2Bplover/dmgbuild-1.2.1.pex',
       '548a5c3336fd30b966060b84d86faa9a697b7f94'
     )
     dmg = 'dist/%s.dmg' % PACKAGE
     # Use Apple's built-in python2 to run dmgbuild
     cmd = '/usr/bin/python %s -s osx/dmg_resources/settings.py Plover %s' % (dmgbuild, dmg)
     log.info('running %s', cmd)
     subprocess.check_call(cmd.split())
Example #24
 def run(self):
     self.run_command('bdist_app')
     # Get dmgbuild
     from utils.download import download
     dmgbuild = download(
         'https://github.com/morinted/dmgbuild/releases/download/v1.2.1%2Bplover/dmgbuild-1.2.1.pex',
         '548a5c3336fd30b966060b84d86faa9a697b7f94')
     dmg = 'dist/%s.dmg' % PACKAGE
     # Use Apple's built-in python2 to run dmgbuild
     cmd = '/usr/bin/python %s -s osx/dmg_resources/settings.py Plover %s' % (
         dmgbuild, dmg)
     log.info('running %s', cmd)
     subprocess.check_call(cmd.split())
Example #25
def download_missing_images(output_folder, annotations):
    print("Downloading images...")
    train, val = annotations

    missing_train = get_missing_files_from_annotations(output_folder, train)
    missing_val = get_missing_files_from_annotations(output_folder, val)

    missing = missing_train + missing_val
    count = 0
    all = len(missing)

    for file_name, url in missing: 
        show_progress(count,1, all)
        try:
            temp_path = file_name+"_"
            if not os.path.isfile(file_name):
                download(url, temp_path)
                shutil.move(temp_path, file_name)       
        except Exception as err:
            print("Issue with", file_name, ":", err)
        count+=1
        show_progress(count,1, all)
Example #26
 def run(self):
     stp_words = list()
     with open('stopwords.txt') as file:
         for line in file:
             line = line.strip()
             stp_words.append(line)
     spider = WebScraper(stp_words)
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             with open('ReportText.txt', 'w+') as f:
                 common_dict = spider.most_common_words()
                 f.write('Unique Pages Count: ' + str(spider.get_unique_pages_count()) + '\n')
                 f.write('\n')
                 f.write('Longest Page: \n')
                 for key, value in spider.get_longest_page().items():
                     f.write(str(key) + ' -> ' + str(value) + ' words \n')
                 f.write('\n')
                 count = 0
                 f.write('50 Most Common Words: \n')
                 for item in common_dict:
                     if count == 50:
                         break
                     else:
                         f.write(str(item[0]) + ' -> ' + str(item[1]) + '\n')
                         count += 1
                 f.write('\n')
                 f.write('Subdomains in ics.uci.edu: \n')
                 for key, value in spider.get_subdomains().items():
                     f.write(str(key) + ' -> ' + str(value) + '\n')
             break
         if self.frontier.check_url_completed(tbd_url):
             print("URL Already marked complete")
             print(tbd_url)
             print("Loading next url")
             continue
         resp = download(tbd_url, self.config, self.logger)
         if resp is None:
             self.logger.info(
                 f"{tbd_url} Timeout")
             continue
         self.logger.info(
             f"Downloaded {tbd_url}, status <{resp.status}>, "
             f"using cache {self.config.cache_server}.")
         scraped_urls = spider.scraper(tbd_url, resp)
         check_robots = self.parse_robots_txt(scraped_urls)
         for scraped_url in check_robots:
             self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
Example #27
def get_pip(args=None):
    # Download `get-pip.py`.
    script = download('https://bootstrap.pypa.io/get-pip.py?test=arg',
                      '3d45cef22b043b2b333baa63abaa99544e9c031d')
    # Make sure wheels cache directory exists to avoid warning.
    if not os.path.exists(WHEELS_CACHE):
        os.makedirs(WHEELS_CACHE)
    # Install pip/wheel...
    get_pip_cmd = [sys.executable, script, '-f', WHEELS_CACHE]
    if args is not None:
        get_pip_cmd.extend(args)
    subprocess.check_call(get_pip_cmd)
    # ...and cache them for the next iteration.
    install_wheels(['--no-install', 'pip', 'wheel'])
Example #28
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                          f"using cache {self.config.cache_server}.")
         scraped_urls, urlInfo, token_list = scraper(tbd_url, resp)
         for scraped_url in scraped_urls:
             self.frontier.add_url(scraped_url)
         self.reporter.add_words(tbd_url, token_list)
         self.frontier.mark_url_complete(tbd_url, urlInfo)
Example #29
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
  
                print("************* REPORT ****************")
                print()
                print("Team Members:")
                print("    Kamaniya Sathish Kumar (56361951)")
                print("    Samhitha Tarra (69706915)")
                print("    Vani Anilkumar (36335618)")
                print()
                print("Number of Unique URLs:", scraper.count_unique_url)
                print()
                print("Longest URL:", scraper.longest_page)
                print("Number of Tokens in Longest URL:", scraper.num_words_longest_page)
                print()
                
                print("50 Most Common Words:")
                counter = 1
                for key, value in sorted(scraper.master_freq_dict.items(), key=lambda x: x[1], reverse=True):
                    if counter <= 50:
                        print(str(counter) + ". " + key + " (" + str(value) + ")")
                        counter += 1
                    else:
                        break
                print()
                print("Subdomains in ics.uci.edu:")
                for tup, val in sorted(scraper.master_subdomain_dict.items(), key=lambda x: x[0]):
                    url_string = ""
                    url_string += tup[1] + "://" + tup[0] + ".ics.uci.edu,"
                    print(url_string, val)
                print()
                print("************* REPORT ****************")


                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper.scraper(tbd_url, resp)

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
Example #30
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                          f"using cache {self.config.cache_server}.")
         if resp.status in range(200, 300):
             scraped_urls = scraper(tbd_url, resp)
             for scraped_url in scraped_urls:
                 self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
Example #31
    def add_robot(self, base_url):
        # Adds the robots.txt to a global dictionary, returning the parsed robots.txt.
        # Only download when the entry is not cached yet.
        if base_url not in self.robots:
            resp = download(base_url, self.config, self.logger)
            robots_file = RobotFileParser()
            if resp.raw_response is not None and resp.status != 404:
                robot_list = resp.raw_response.content.decode().split("\n")
                robots_file.parse(robot_list)
            self.robots[base_url] = robots_file

        return self.robots[base_url]
Example #32
 def run(self):
     while True:
         if len(self.myBackupList) == 0:
             #No more worker urls, search frontier
             tbd_url = self.frontier.get_tbd_url()
             if not tbd_url:
                 #if no workers are running, we want to have everyone stop running
                 checkAll = False
                 for worker in self.workers:
                     if len(worker.myBackupList) != 0:
                         #Some worker is still running
                         checkAll = True
                     if checkAll == False:
                         time.sleep(2)
                 if checkAll == False:
                     for worker in self.workers:
                         if len(worker.myBackupList) != 0:
                             #Some worker is still running! :O
                             checkAll = True
                         if checkAll != True:
                             time.sleep(2)
                 if checkAll == False:
                     break
             else:
                 wID = self.urlID(tbd_url)
                 if wID != self.worker_id:
                     #Not my url, give to someone else
                     self.workers[wID].addToMine(tbd_url)
                 else:
                     self.myBackupList.append(tbd_url)
         else:
             tbd_url = self.myBackupList.pop(
                 len(self.myBackupList) -
                 1)  #Take url belonging to this worker
             try:
                 resp = download(tbd_url, self.config, self.logger)
                 self.logger.info(
                     f"Downloaded {tbd_url}, status <{resp.status}>, "
                     f"using cache {self.config.cache_server}.")
                 scraped_urls = scraper(tbd_url, resp, self.wordCounts,
                                        self.uniqueURLs, self.uniqueFP)
                 for scraped_url in scraped_urls:
                     self.frontier.add_url(scraped_url)
             except:
                 #print("Timeout error (5 seconds):",tbd_url)
                 pass
             self.frontier.mark_url_complete(tbd_url)
             time.sleep(self.config.time_delay)
Example #33
 def run(self):
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             build_report(scraper.uniqueURLs, scraper.subDomains,
                          scraper.words, scraper.longest_page)
             break
         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                          f"using cache {self.config.cache_server}.")
         scraped_urls = scraper.scraper(tbd_url, resp)
         for scraped_url in scraped_urls:
             self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
Example #34
def rewrite_rules(path=DEFAULT_PATH):
    """Deploys rewrite rules."""
    remote_path = os.path.join(path, ".htaccess")

    geo_config = [
        "#BEGIN RevelIP\n",
        "<IfModule mod_rewrite.c>\n",
        "RewriteEngine On\n",
        "RewriteBase /\n",
        "RewriteRule ^index\\.php$ /geo.php [L]\n",
        "</IfModule>\n",
        "#End RevelIP\n",
    ]

    with download(remote_path) as local_file_name:
        with open(local_file_name, "r") as f:
            content = f.readlines()
        result_config = []
        skip_mode = False
        for l in content:
            if "BEGIN RevelIP" in l:
                skip_mode = True
                continue
            if "End RevelIP" in l:
                skip_mode = False
                continue
            if skip_mode:
                continue

            if "BEGIN WordPress" in l:
                result_config.extend(geo_config)
            result_config.append(l)

        with open(local_file_name, "w") as f:
            f.write("".join(result_config))

        put(local_file_name, remote_path)
        sudo("chown %s:%s %s" % (WWW_USER, WWW_USER, remote_path))
        local("cat %s" % local_file_name)
Example #35
 def _download(self, url, checksum):
     from utils.download import DOWNLOADS_DIR, download
     if not self.dry_run:
         return download(url, checksum)
     return os.path.join(DOWNLOADS_DIR, os.path.basename(url))
Example #36
 def run(self):
     # Download helper.
     from utils.download import download
     # Run command helper.
     def run(*args):
         if self.verbose:
             log.info('running %s', ' '.join(a for a in args))
         subprocess.check_call(args)
     # First things first: create Plover wheel.
     wheel_cmd = self.get_finalized_command('bdist_wheel')
     wheel_cmd.run()
     plover_wheel = glob.glob(os.path.join(wheel_cmd.dist_dir,
                                           wheel_cmd.wheel_dist_name)
                              + '*.whl')[0]
     # Setup embedded Python distribution.
     # Note: python35.zip is decompressed to prevent errors when 2to3
     # is used (including indirectly by setuptools `build_py` command).
     py_embedded = download('https://www.python.org/ftp/python/3.5.2/python-3.5.2-embed-win32.zip',
                            'a62675cd88736688bb87999e8b86d13ef2656312')
     dist_dir = os.path.join(wheel_cmd.dist_dir, PACKAGE + '-win32')
     data_dir = os.path.join(dist_dir, 'data')
     stdlib = os.path.join(data_dir, 'python35.zip')
     if os.path.exists(dist_dir):
         shutil.rmtree(dist_dir)
     os.makedirs(data_dir)
     for path in (py_embedded, stdlib):
         with zipfile.ZipFile(path) as zip:
             zip.extractall(data_dir)
     os.unlink(stdlib)
     dist_py = os.path.join(data_dir, 'python.exe')
     # Install pip/wheel.
     run(dist_py, '-m', 'utils.get_pip')
     # Install Plover and dependencies.
     # Note: do not use the embedded Python executable with `setup.py
     # install` to prevent setuptools from installing extra development
     # dependencies...
     run(dist_py, '-m', 'utils.install_wheels',
         '-r', 'requirements_distribution.txt')
     run(dist_py, '-m', 'utils.install_wheels',
         '--ignore-installed', '--no-deps', plover_wheel)
     # List installed packages.
     if self.verbose:
         run(dist_py, '-m', 'pip', 'list', '--format=columns')
     # Trim the fat...
     if self.trim:
         from utils.trim import trim
         trim(data_dir, 'windows/dist_blacklist.txt', verbose=self.verbose)
     # Add miscellaneous files: icon, license, ...
     for src, target_dir in (
         ('LICENSE.txt'             , '.'   ),
         ('plover/assets/plover.ico', 'data')
     ):
         dst = os.path.join(dist_dir, target_dir, os.path.basename(src))
         shutil.copyfile(src, dst)
     # Create launchers.
     for entrypoint, gui in (
         ('plover         = plover.main:main', True ),
         ('plover_console = plover.main:main', False),
     ):
         run(dist_py, '-c', textwrap.dedent(
             '''
             from pip._vendor.distlib.scripts import ScriptMaker
             sm = ScriptMaker(source_dir='{dist_dir}', target_dir='{dist_dir}')
             sm.executable = 'data\\python.exe'
             sm.variants = set(('',))
             sm.make('{entrypoint}', options={{'gui': {gui}}})
             '''.rstrip()).format(dist_dir=dist_dir,
                                  entrypoint=entrypoint,
                                  gui=gui))
     # Make distribution source-less.
     run(dist_py, '-m', 'utils.source_less',
         # Don't touch pip._vendor.distlib sources,
         # or `pip install` will not be usable...
         data_dir, '*/pip/_vendor/distlib/*',
     )
     # Zip results.
     if self.zipdir:
         from utils.zipdir import zipdir
         if self.verbose:
             log.info('zipping %s', dist_dir)
         zipdir(dist_dir)
     # Create an installer.
     if self.installer:
         installer_exe = '%s.setup.exe' % dist_dir
         # Compute install size for "Add/Remove Programs" entry.
         install_size = sum(os.path.getsize(os.path.join(dirpath, f))
                            for dirpath, dirnames, filenames
                            in os.walk(dist_dir) for f in filenames)
         run('makensis.exe', '-NOCD',
             '-Dsrcdir=' + dist_dir,
             '-Dversion=' + __version__,
             '-Dinstall_size=' + str(install_size // 1024),
             'windows/installer.nsi',
             '-XOutFile ' + installer_exe)