Example No. 1
def run_flask():
    CrawlGlobal.context().logger.info("start flask with host: %s",
                                      CRAWLER_HOSTNAME)
    app.run(debug=DEBUG_MODE,
            host=CRAWLER_HOSTNAME,
            port=PORT,
            use_reloader=False)
Example No. 2
    def run(self, text):
        CrawlGlobal.context().logger.info("start running the model")
        try:
            pred = self.make_prediction(text)
            return pred
        except Exception as e:
            CrawlGlobal.context().logger.info(
                "exception thrown while running model %s", str(e))
Example No. 3
    def construct_model(self):
        try:
            CrawlGlobal.context().logger.info("deserializing the model")
            with open("vectorizer.pickle", "rb") as vec_file:
                self.new_vectorizer = pickle.load(vec_file)
            with open(self.model_file, "rb") as model_file:
                self.model = pickle.load(model_file)
        except Exception as e:
            CrawlGlobal.context().logger.info("error in deserializing: %s",
                                              str(e))
Example No. 4
    def test_store_in_redis(self):
        CrawlGlobal.context().cache.rediscache = fakeredis.FakeStrictRedis()

        links = ["x", "y", "z"]
        storageuri = "abc"
        self.scraper.store_in_redis(storageuri, links)
        val = CrawlGlobal.context().cache.get(self.url)
        self.assertEqual(val, {"storage_uri": "abc", "child_urls": ["x", "y", "z"]})
Example No. 5
def crawl():
    url = flask.request.json['url']
    CrawlGlobal.context().logger.info('Crawling %s', url)
    if CrawlGlobal.context().active_thread_count.get() >= MAX_ACTIVE_THREADS:
        return flask.jsonify({'accepted': False})

    CrawlGlobal.context().active_thread_count.increment()
    crawljob = CrawlerJob(url)
    executor.submit(crawljob.execute, CRAWLER_MANAGER_ENDPOINT)
    return flask.jsonify({'accepted': True})
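
For reference, a minimal client-side sketch of how this handler would be called, assuming it is exposed as a POST route named /crawl on the Flask app (the route decorator and path are not shown in this excerpt, so the path is an assumption); it sends the JSON body the handler reads and prints the accepted flag it returns:

# Hypothetical caller (e.g. the crawler manager); the /crawl path is assumed.
import requests

resp = requests.post('http://crawler:8003/crawl',
                     json={'url': 'https://example.com'})
print(resp.json())  # {'accepted': True} once submitted, {'accepted': False} if saturated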
Example No. 6
    def start_scrape(self):
        url = self.base_url
        CrawlGlobal.context().logger.info('start scraping')
        key = 'crawl_pages/{}'.format(str(uuid.uuid4()))
        CrawlGlobal.context().logger.info('Generated key: %s', key)

        file_ext = self.get_extension(url)

        # choose the scraper type (FileScraper, BaseScraper, or WebScraper)
        if file_ext:
            scraper = FileScraper(url, key, file_ext)
        elif not CrawlGlobal.context().is_dynamic_scrape():
            scraper = BaseScraper(url, key)
        else:
            scraper = WebScraper(url, key)

        # scrape the page
        data = scraper.do_scrape()

        # store the scraped data if the filters in do_store allow it
        if self.do_store(file_ext, data):
            CrawlGlobal.context().logger.info(
                "need to store the data for url: %s", self.base_url)
            self.storage_uri = scraper.store_in_gcs(data)
        else:
            CrawlGlobal.context().logger.info(
                "not storing the data for url: %s", self.base_url)

        # get child urls
        self.links = scraper.get_links(data)
        # put in cache
        scraper.store_in_redis(self.storage_uri, self.links)
Example No. 7
    def do_scrape(self):
        CrawlGlobal.context().logger.info("Using Base Scraper")
        CrawlGlobal.context().logger.info("Scraping URL: {}".format(
            self.base_url))
        try:
            response = requests.get(self.base_url)
            response.raise_for_status()
            return response.content
        except Exception as e:
            CrawlGlobal.context().logger.info("error in scraping: %s", str(e))
            return None
Example No. 8
def setup():
    try:
        CrawlGlobal.context().logger.info("crawler end point: %s", ENDPOINT)
        CrawlGlobal.context().logger.info("crawler manager end point: %s",
                                          CRAWLER_MANAGER_ENDPOINT)
        res = requests.post(os.path.join(CRAWLER_MANAGER_ENDPOINT,
                                         'register_crawler'),
                            json={'endpoint': ENDPOINT})
        res.raise_for_status()
        CrawlGlobal.context().logger.info(
            "Registered successfully with crawler manager")
        CrawlGlobal.context().set_useroptions(res.json())
    except Exception as e:
        CrawlGlobal.context().logger.info(
            'Unable to register with crawler manager: %s', str(e))
Example No. 9
    def store_in_redis(self, storageuri, links):
        try:
            CrawlGlobal.context().logger.info(
                'Caching storage_uri and child_urls')
            CrawlGlobal.context().cache.put(self.base_url, {
                'storage_uri': storageuri,
                'child_urls': links
            })
            CrawlGlobal.context().logger.info('Caching was successful')
        except Exception as e:
            CrawlGlobal.context().logger.error(
                'Unable to cache data for %s: %s', self.base_url, str(e))
Example No. 10
    def store_in_gcs(self, fpath):
        CrawlGlobal.context().logger.info('Attempting to store in GCS')
        if not fpath:
            CrawlGlobal.context().logger.info('no file to store in GCS')
            return ''

        try:
            storage_client = storage.Client()
            bucket = storage_client.get_bucket(os.environ['GCS_BUCKET'])
            blob = bucket.blob(self.file_name + self.file_ext)
            CrawlGlobal.context().logger.info('Got the blob in GCS')
            blob.upload_from_filename(fpath)
            blob.make_public()
            uri = blob.public_url
            CrawlGlobal.context().logger.info('uri successfully generated!')
            return uri
        except Exception as e:
            CrawlGlobal.context().logger.error(
                'Unable to store webpage for %s: %s', self.base_url, str(e))
            return ''
Example No. 11
    def execute(self, endpoint):
        CrawlGlobal.context().logger.info('Starting crawl thread for %s',
                                          self.base_url)
        try:
            if not self.is_cached() or CrawlGlobal.context().has_model():
                self.start_scrape()
            else:
                CrawlGlobal.context().logger.info('Url %s already cached',
                                                  self.base_url)

            # callback manager
            self.send_response_to_manager(endpoint)
        except Exception as e:
            CrawlGlobal.context().logger.info("exception: {}".format(str(e)))

        CrawlGlobal.context().active_thread_count.decrement()
Example No. 12
    def do_scrape(self):
        if not self.is_valid():
            return super(WebScraper, self).do_scrape()
        CrawlGlobal.context().logger.info("Using Web Scraper")
        try:
            CrawlGlobal.context().logger.info("Scraping URL: {}".format(self.base_url))
            return CrawlGlobal.context().get_data(self.base_url)
        except Exception as e:
            CrawlGlobal.context().logger.info("error in scraping: {}".format(str(e)))
            return None
Example No. 13
def test_connections():
    try:
        CrawlGlobal.context().cache.ping()
        CrawlGlobal.context().logger.info('connected to redis successfully')
    except Exception as e:
        CrawlGlobal.context().logger.info('could not initialize redis: %s',
                                          str(e))
Example No. 14
    def do_store(self, ext, data):
        docs_all = CrawlGlobal.context().scrape_all
        docs_pdf = CrawlGlobal.context().scrape_pdf
        docs_docx = CrawlGlobal.context().scrape_docx
        if ext or not CrawlGlobal.context().has_model():
            # File download or no classification model: decide from the requested
            # document types alone; if no type was requested, store everything.
            if ((docs_all == True) or (ext == '.pdf' and docs_pdf == True)
                    or (ext == '.docx' and docs_docx == True)
                    or (docs_pdf == False and docs_docx == False
                        and docs_all == False)):
                return True
            else:
                CrawlGlobal.context().logger.info(
                    'Non model: No matching doc type')
                return False
        else:
            # Regular page with a model: unless scraping is restricted to pdf/docx
            # only, run the model and store when its predicted label was requested
            # or the prediction failed (-1).
            if ((docs_all == True) or (docs_pdf == False and docs_docx == False
                                       and docs_all == False)):
                cur_pred = CrawlGlobal.context().modelrunner.run(data)
                return cur_pred == -1 or CrawlGlobal.context().has_label(
                    cur_pred)
            else:
                CrawlGlobal.context().logger.info(
                    'Model: No matching doc type')
                return False
Example No. 15
def ping_crawler_manager():
    try:
        CrawlGlobal.context().logger.info(
            'pinging CRAWLER_MANAGER_ENDPOINT - %s', CRAWLER_MANAGER_ENDPOINT)
        response = requests.get(CRAWLER_MANAGER_ENDPOINT)
        response.raise_for_status()
        CrawlGlobal.context().logger.info('ping successful!')
    except Exception as e:
        CrawlGlobal.context().logger.error(
            "Could not connect to crawler manager: %s", str(e))
Example No. 16
    def make_prediction(self, text):
        try:
            CrawlGlobal.context().logger.info("transforming and predicting")
            vectors = self.new_vectorizer.transform([text])
            prediction = self.model.predict(vectors)
            CrawlGlobal.context().logger.info("model predicted: %d",
                                              prediction[0])
            return prediction[0]
        except Exception as e:
            CrawlGlobal.context().logger.info("error in predicting: %s",
                                              str(e))
            return -1
Example No. 17
def kill():
    if ENVIRONMENT == 'local':
        CrawlGlobal.context().logger.info(
            'Not killing crawler because running locally')
    else:
        CrawlGlobal.context().logger.info(
            "Will kill flask server in 3 seconds")
        kill_thread = threading.Thread(target=kill_main_thread)
        kill_thread.start()

    CrawlGlobal.context().logger.info('Kill called')
    return "ok"
Example No. 18
    def send_response_to_manager(self, endpoint):
        links_api = os.path.join(endpoint, 'links')
        CrawlGlobal.context().logger.info('Endpoint on Crawler manager: %s',
                                          links_api)
        try:
            CrawlGlobal.context().logger.info(
                'Sending response back to crawler manager...')
            response = requests.post(links_api,
                                     json={
                                         'main_url': self.base_url,
                                         'storage_uri': self.storage_uri,
                                         'child_urls': self.links
                                     })
            response.raise_for_status()
            CrawlGlobal.context().logger.info('Response sent successfully!')
            return response
        except Exception as e:
            CrawlGlobal.context().logger.error(
                "Could not connect to crawler manager: %s", str(e))
            return None
Example No. 19
    def is_cached(self):
        CrawlGlobal.context().logger.info('connecting to redis')
        if CrawlGlobal.context().cache.exists(self.base_url):
            cache_val = CrawlGlobal.context().cache.get(self.base_url)
            if (cache_val is not None and 'storage_uri' in cache_val
                    and 'child_urls' in cache_val):
                self.storage_uri = cache_val['storage_uri']
                self.links = cache_val['child_urls']
                if self.storage_uri is None:
                    CrawlGlobal.context().logger.info(
                        'Error condition. storage_uri None for cached url: %s',
                        self.base_url)
                    self.storage_uri = ''
                if self.links is None:
                    CrawlGlobal.context().logger.info(
                        'Error condition. links None for cached url: %s',
                        self.base_url)
                    self.links = []
                return True
            else:
                return False
        return False
Example No. 20
    def do_scrape(self):
        try:
            CrawlGlobal.context().logger.info("Scraping URL: {}".format(
                self.base_url))
            r = requests.get(self.base_url, stream=True)
            CrawlGlobal.context().logger.info("request status: %d",
                                              r.status_code)
            r.raise_for_status()

            tmpfile = self.file_name
            if self.file_name.startswith('crawl_pages/'):
                tmpfile = self.file_name[len('crawl_pages/'):]

            fpath = '/tmp/' + tmpfile + self.file_ext
            CrawlGlobal.context().logger.info("file path is: %s", fpath)
            with open(fpath, 'wb') as fd:
                for chunk in r.iter_content(self.chunk_size):
                    fd.write(chunk)

            return fpath
        except Exception as e:
            CrawlGlobal.context().logger.info("error in writing file: %s",
                                              str(e))
            return None
Example No. 21
    def store_in_gcs(self, data):
        if data is None:
            return ''

        CrawlGlobal.context().logger.info('Attempting to store in GCS')
        try:
            storage_client = storage.Client()
            bucket = storage_client.get_bucket(os.environ['GCS_BUCKET'])
            blob = bucket.blob(self.file_name)
            blob.upload_from_string(data)
            blob.make_public()
            uri = blob.public_url
            CrawlGlobal.context().logger.info('uri successfully generated!')
            return uri
        except Exception as e:
            CrawlGlobal.context().logger.error(
                'Unable to store webpage for %s: %s', self.base_url, str(e))
            return ''
Example No. 22
    def get_links(self, data):
        if data is None:
            return []

        CrawlGlobal.context().logger.info('Parsing links...')
        try:
            bs_obj = BeautifulSoup(data, 'html.parser')
            links_obj = {}
            for link in bs_obj.find_all('a'):
                if 'href' in link.attrs:
                    links_obj[link.attrs['href']] = 1

            links = list(links_obj.keys())
            CrawlGlobal.context().logger.info('Found links in %s: %s',
                                              self.base_url, str(links))
            return links
        except Exception as e:
            CrawlGlobal.context().logger.error(
                "Could not list links in url: %s", str(e))
            return []
Example No. 23
    def is_valid(self):
        return bool(CrawlGlobal.context().get_driver())
Example No. 24
    def __init__(self, base_url, key):
        CrawlGlobal.context().logger.info("instantiating web scraper")
        BaseScraper.__init__(self, base_url, key)
Example No. 25
    def test_is_cached(self):
        CrawlGlobal.context().cache.rediscache = fakeredis.FakeStrictRedis()
        retval = self.crawljob.is_cached()
        self.assertIs(retval, False)
Example No. 26
    def __init__(self, base_url, key, ext):
        BaseScraper.__init__(self, base_url, key)
        CrawlGlobal.context().logger.info("instantiating file scraper")
        self.file_ext = ext
        self.chunk_size = 2000
Example No. 27
    def get_extension(self, url):
        CrawlGlobal.context().logger.info('url is: %s', self.base_url)
        # return the first matching extension, or None if the url is not a file link
        matches = [x for x in ALLOWABLE_EXTENSIONS if url.lower().endswith(x)]
        ext = matches[0] if matches else None
        CrawlGlobal.context().logger.info('extension is: %s', ext)
        return ext
Example No. 28
def status():
    return flask.jsonify(
        {'active_threads': CrawlGlobal.context().active_thread_count.get()})
Example No. 29
def kill_main_thread():
    time.sleep(3)
    CrawlGlobal.context().logger.info("Kill confirmed")
    os._exit(0)
Example No. 30
import concurrent.futures
import logging
import os
import signal
import sys
import threading
import time
import uuid
import _thread

import flask
import requests

import crawler_context
from crawl_job import CrawlerJob
from crawl_global import CrawlGlobal

app = flask.Flask(__name__)
app.logger.setLevel(logging.INFO)
context = crawler_context.Context(app.logger)
CrawlGlobal.set_context(context)

CRAWLER_MANAGER_ENDPOINT = os.environ.get('CRAWLER_MANAGER_ENDPOINT',
                                          'http://crawler-manager:8002')
ENVIRONMENT = os.environ.get('ENVIRONMENT', 'local')
URL = os.environ.get('URL', 'https://google.com')
DEBUG_MODE = ENVIRONMENT == 'local'
HOSTNAME = os.environ.get('JOB_IP', 'crawler')
CRAWLER_HOSTNAME = os.environ.get('CRAWLER_HOSTNAME', 'crawler')
RELEASE_DATE = os.environ.get('RELEASE_DATE', '0')
NAMESPACE = os.environ.get('NAMESPACE', 'default')
MAX_ACTIVE_THREADS = 4
PORT = 8003
ENDPOINT = 'http://{}:{}'.format(HOSTNAME, PORT)

executor = concurrent.futures.ThreadPoolExecutor(MAX_ACTIVE_THREADS)