Пример #1
0
class SourcePinPipeline(object):
    """Scrapy item pipeline that stores scraped URLs in MongoDB."""

    def __init__(self, mongo_uri):
        # Expected as a "host:port" string; parsed lazily in open_spider.
        self.mongo_uri = mongo_uri

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's MONGO_URI setting."""
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'))

    def open_spider(self, spider):
        """Open the Mongo connection when the spider starts."""
        self.mongo_address, self.mongo_port = self.mongo_uri.split(":")
        self.mmu = MemexMongoUtils(address=self.mongo_address,
                                   port=int(self.mongo_port))

    def close_spider(self, spider):
        # MemexMongoUtils exposes no explicit close; nothing to release.
        pass

    def process_item(self, item, spider):
        """Persist the item, then hand it on to later pipeline stages."""
        self.mmu.insert_url(**dict(item))
        # Scrapy's pipeline contract requires returning the item; without
        # this, every downstream pipeline receives None and items are lost.
        return item
Пример #2
0
class SourcePinPipeline(object):
    """Scrapy item pipeline that writes scraped URLs into MongoDB."""

    def __init__(self, mongo_uri):
        # "host:port" string; split when the spider is opened.
        self.mongo_uri = mongo_uri

    @classmethod
    def from_crawler(cls, crawler):
        """Construct the pipeline from the MONGO_URI crawler setting."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
        )

    def open_spider(self, spider):
        """Connect to Mongo when the spider is opened."""
        self.mongo_address, self.mongo_port = self.mongo_uri.split(":")
        self.mmu = MemexMongoUtils(address=self.mongo_address,
                                   port=int(self.mongo_port))

    def close_spider(self, spider):
        # No explicit teardown is exposed by MemexMongoUtils.
        pass

    def process_item(self, item, spider):
        """Store the item and return it for downstream pipeline stages."""
        self.mmu.insert_url(**dict(item))
        # Returning the item is required by the Scrapy pipeline contract;
        # otherwise later pipelines receive None.
        return item
Пример #3
0
def train_and_score_mongo():
    """ Rescore all items from mongo """
    
    print "**************Training*********************"
    train_on_user_input()


    print "**************Scoring and Indexing*****************"
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html = True)

    ranker = Ranker.load()
    for doc in tqdm(docs, leave = True):
        try:
            score = ranker.score_doc(doc)
        except:
            score = 0

        mmu.set_score(doc["url"], score)

    _score_hosts()
Пример #4
0
def _score_hosts():

    mmu = MemexMongoUtils()
    for host_doc in mmu.list_all_hosts():
        print host_doc
        score = mmu.get_host_score(host_doc["host"])
        mmu.set_host_score(host_doc["host"], score)
Пример #5
0
    def __init__(self, manager):
        """Configure the Mongo-backed collection from the manager's settings.

        Raises NotConfigured when hostname or port is absent.
        """
        cfg = manager.settings
        hostname = cfg.get('BACKEND_MONGO_HOSTNAME', None)
        port = cfg.get('BACKEND_MONGO_PORT', None)
        if hostname is None or port is None:
            raise NotConfigured
        self.client = MongoClient(hostname, port)
        db_name = cfg.get('BACKEND_MONGO_DB_NAME')

        # Used only to discover the default collection name, then discarded.
        mmu = MemexMongoUtils(address=hostname, port=port)
        collection_name = cfg.get('BACKEND_MONGO_COLLECTION_NAME',
                                  mmu.cf_collection.name)
        del mmu

        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        self.manager = manager
Пример #6
0
 def __init__(self, screenshot_dir, which_collection = "crawl-data"):
     """Remember the screenshot directory and open a Mongo utils handle."""
     self.screenshot_dir = screenshot_dir
     self.mmu = MemexMongoUtils(which_collection = which_collection)
Пример #7
0
class SplashGet(object):
    """Manually get a splash screenshot"""

    def __init__(self, screenshot_dir, which_collection = "crawl-data"):
        self.mmu = MemexMongoUtils(which_collection = which_collection)
        self.screenshot_dir = screenshot_dir

    def makedir(self, path):
        try:
            os.makedirs(path)
        except OSError:
            pass
    
    def splash_request(self, url):

        splash_response = requests.get('http://localhost:8050/render.json?url=%s&html=1&png=1&wait=2.0&width=640&height=480&timeout=60' % url)
        return splash_response

    def save_screenshot(self, prefix, data):
        png = base64.b64decode(data['png'])
        dirname = os.path.join(self.screenshot_dir, prefix)
        self.makedir(dirname)
    
        fn = os.path.join(dirname, md5(png).hexdigest() + '.png')
        with open(fn, 'wb') as fp:
            fp.write(png)
        return fn

    def process_splash_response(self, url, splash_response):
        data = json.loads(splash_response.text, encoding='utf8')
    
        screenshot_path = self.save_screenshot(get_domain(url), data)
        return screenshot_path

    def request_and_save(self, url):
        print "Getting screenshot for %s" % url
        splash_response = self.splash_request(url)
        screenshot_path = self.process_splash_response(url, splash_response)
        self.mmu.set_screenshot_path(url, screenshot_path)

    def resolve_images_by_host(self, host):
        url_dics = self.mmu.list_urls(host, limit=2000)
        for url_dic in url_dics:
            self.request_and_save(url_dic["url"])

    def resolve_images_by_url_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            #get only if it doesn't have an existing screenshot            
            if "screenshot_path" not in url_dic:
                #!string matching for now, makes more sense as regex
                if match_term in url_dic["url"]:
                    self.request_and_save(url_dic["url"])

    def resolve_images_by_host_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            #get only if it doesn't have an existing screenshot
            if "screenshot_path" not in url_dic:
                #!string matching for now, makes more sense as regex
                if match_term in url_dic["host"]:
                    self.request_and_save(url_dic["url"])
Пример #8
0
def get_mdocs(interest):
    """Return all url docs tagged with *interest*, including raw HTML."""
    return MemexMongoUtils().list_all_urls_with_interest(
        interest, return_html=True)
 def __init__(self, screenshot_dir, which_collection="crawl-data"):
     """Store the screenshot directory and connect to the given collection."""
     self.screenshot_dir = screenshot_dir
     self.mmu = MemexMongoUtils(which_collection=which_collection)
class SplashGet(object):
    """Manually get a splash screenshot"""
    def __init__(self, screenshot_dir, which_collection="crawl-data"):
        self.mmu = MemexMongoUtils(which_collection=which_collection)
        self.screenshot_dir = screenshot_dir

    def makedir(self, path):
        try:
            os.makedirs(path)
        except OSError:
            pass

    def splash_request(self, url):

        splash_response = requests.get(
            SPLASH_URL +
            '/render.json?url=%s&html=1&png=1&wait=2.0&width=640&height=480&timeout=60&images=0'
            % url)
        return splash_response

    def save_screenshot(self, prefix, data):
        png = base64.b64decode(data['png'])
        dirname = os.path.join(self.screenshot_dir, prefix)
        self.makedir(dirname)

        fn = os.path.join(dirname, md5(png).hexdigest() + '.png')
        print fn
        with open(fn, 'wb') as fp:
            fp.write(png)
        return fn

    def process_splash_response(self, url, splash_response):
        data = json.loads(splash_response.text, encoding='utf8')

        screenshot_path = self.save_screenshot(get_domain(url), data)
        html_rendered = data["html"]

        return screenshot_path, html_rendered

    def request_and_save(self, url):
        print "Getting screenshot for %s" % url
        splash_response = self.splash_request(url)
        screenshot_path, html_rendered = self.process_splash_response(
            url, splash_response)
        self.mmu.set_screenshot_path(url, screenshot_path)
        self.mmu.set_html_rendered(url, html_rendered)

    def resolve_images_by_host(self, host):
        url_dics = self.mmu.list_urls(host, limit=2000)
        for url_dic in url_dics:
            self.request_and_save(url_dic["url"])

    def resolve_images_by_url_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            #get only if it doesn't have an existing screenshot
            if "screenshot_path" not in url_dic:
                #!string matching for now, makes more sense as regex
                if match_term in url_dic["url"]:
                    self.request_and_save(url_dic["url"])

    def resolve_images_by_host_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            #get only if it doesn't have an existing screenshot
            if "screenshot_path" not in url_dic:
                #!string matching for now, makes more sense as regex
                if match_term in url_dic["host"]:
                    self.request_and_save(url_dic["url"])

    def get_url_chunks(self, chunk_size):
        url_dics = self.mmu.list_all_urls()
        for i in xrange(0, len(url_dics), chunk_size):
            yield url_dics[i:i + chunk_size]

    # NOTE: a stray unterminated '"""' from a truncated paste was removed here.
Пример #11
0
 def open_spider(self, spider):
     """Split mongo_uri into host/port and open the Mongo connection."""
     host, port = self.mongo_uri.split(":")
     self.mongo_address = host
     self.mongo_port = port
     self.mmu = MemexMongoUtils(address=host, port=int(port))
Пример #12
0
def get_mdocs(interest):
    """Fetch every url doc tagged with *interest*, HTML included."""
    mongo_utils = MemexMongoUtils()
    docs = mongo_utils.list_all_urls_with_interest(interest, return_html=True)
    return docs
Пример #13
0
 def open_spider(self, spider):
     self.mongo_address, self.mongo_port = self.mongo_uri.split(":")
     print self.mongo_address, self.mongo_port
     self.mmu = MemexMongoUtils(address = self.mongo_address, port = int(self.mongo_port))