class SourcePinPipeline(object):
    """Scrapy item pipeline that writes scraped URL items into Mongo via MemexMongoUtils."""

    def __init__(self, mongo_uri):
        self.mongo_uri = mongo_uri

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
        )

    def open_spider(self, spider):
        # MONGO_URI is expected in "host:port" form.
        self.mongo_address, self.mongo_port = self.mongo_uri.split(":")
        self.mmu = MemexMongoUtils(address=self.mongo_address, port=int(self.mongo_port))

    def close_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.mmu.insert_url(**dict(item))
        # Return the item so later pipeline stages and exporters still receive it.
        return item
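A minimal sketch of how this pipeline could be enabled from a Scrapy project's settings.py; the module path is hypothetical, and MONGO_URI is the setting read by from_crawler() above.

# settings.py (sketch; 'myproject.pipelines' is a hypothetical module path)
ITEM_PIPELINES = {
    'myproject.pipelines.SourcePinPipeline': 300,
}
MONGO_URI = 'localhost:27017'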
def train_and_score_mongo():
    """Retrain the ranker on user input, then rescore all items from mongo."""
    print "**************Training*********************"
    train_on_user_input()

    print "**************Scoring and Indexing*****************"
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html=True)
    ranker = Ranker.load()
    for doc in tqdm(docs, leave=True):
        try:
            score = ranker.score_doc(doc)
        except Exception:
            # Scoring can fail on malformed documents; fall back to zero.
            score = 0
        mmu.set_score(doc["url"], score)
    _score_hosts()
def _score_hosts():
    mmu = MemexMongoUtils()
    for host_doc in mmu.list_all_hosts():
        print host_doc
        score = mmu.get_host_score(host_doc["host"])
        mmu.set_host_score(host_doc["host"], score)
def __init__(self, manager):
    settings = manager.settings
    mongo_hostname = settings.get('BACKEND_MONGO_HOSTNAME', None)
    mongo_port = settings.get('BACKEND_MONGO_PORT', None)
    if mongo_hostname is None or mongo_port is None:
        raise NotConfigured
    self.client = MongoClient(mongo_hostname, mongo_port)
    mongo_db = settings.get('BACKEND_MONGO_DB_NAME')

    # Default the collection name to the one MemexMongoUtils already uses.
    mmu = MemexMongoUtils(address=mongo_hostname, port=mongo_port)
    mongo_collection = settings.get('BACKEND_MONGO_COLLECTION_NAME',
                                    mmu.cf_collection.name)
    del mmu

    self.db = self.client[mongo_db]
    self.collection = self.db[mongo_collection]
    self.manager = manager
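A settings sketch for this backend; only the setting names come from the code above, the values are placeholder assumptions.

BACKEND_MONGO_HOSTNAME = 'localhost'   # required, otherwise NotConfigured is raised
BACKEND_MONGO_PORT = 27017             # required, otherwise NotConfigured is raised
BACKEND_MONGO_DB_NAME = 'memex'        # placeholder database name
# Optional; falls back to the MemexMongoUtils crawl-data collection name.
BACKEND_MONGO_COLLECTION_NAME = 'crawl-data'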
class SplashGet(object):
    """Manually get a splash screenshot"""

    def __init__(self, screenshot_dir, which_collection="crawl-data"):
        self.mmu = MemexMongoUtils(which_collection=which_collection)
        self.screenshot_dir = screenshot_dir

    def makedir(self, path):
        try:
            os.makedirs(path)
        except OSError:
            pass

    def splash_request(self, url):
        splash_response = requests.get(
            'http://localhost:8050/render.json?url=%s&html=1&png=1&wait=2.0&width=640&height=480&timeout=60' % url)
        return splash_response

    def save_screenshot(self, prefix, data):
        png = base64.b64decode(data['png'])
        dirname = os.path.join(self.screenshot_dir, prefix)
        self.makedir(dirname)
        fn = os.path.join(dirname, md5(png).hexdigest() + '.png')
        with open(fn, 'wb') as fp:
            fp.write(png)
        return fn

    def process_splash_response(self, url, splash_response):
        data = json.loads(splash_response.text, encoding='utf8')
        screenshot_path = self.save_screenshot(get_domain(url), data)
        return screenshot_path

    def request_and_save(self, url):
        print "Getting screenshot for %s" % url
        splash_response = self.splash_request(url)
        screenshot_path = self.process_splash_response(url, splash_response)
        self.mmu.set_screenshot_path(url, screenshot_path)

    def resolve_images_by_host(self, host):
        url_dics = self.mmu.list_urls(host, limit=2000)
        for url_dic in url_dics:
            self.request_and_save(url_dic["url"])

    def resolve_images_by_url_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            # Get only if it doesn't have an existing screenshot.
            if "screenshot_path" not in url_dic:
                # String matching for now; makes more sense as a regex.
                if match_term in url_dic["url"]:
                    self.request_and_save(url_dic["url"])

    def resolve_images_by_host_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            # Get only if it doesn't have an existing screenshot.
            if "screenshot_path" not in url_dic:
                # String matching for now; makes more sense as a regex.
                if match_term in url_dic["host"]:
                    self.request_and_save(url_dic["url"])
def get_mdocs(interest):
    mmu = MemexMongoUtils()
    return mmu.list_all_urls_with_interest(interest, return_html=True)
import base64
import json
import os
from hashlib import md5

import requests

# SPLASH_URL, get_domain and MemexMongoUtils are provided elsewhere in the project.


class SplashGet(object):
    """Manually get a splash screenshot"""

    def __init__(self, screenshot_dir, which_collection="crawl-data"):
        self.mmu = MemexMongoUtils(which_collection=which_collection)
        self.screenshot_dir = screenshot_dir

    def makedir(self, path):
        try:
            os.makedirs(path)
        except OSError:
            pass

    def splash_request(self, url):
        splash_response = requests.get(
            SPLASH_URL + '/render.json?url=%s&html=1&png=1&wait=2.0&width=640&height=480&timeout=60&images=0' % url)
        return splash_response

    def save_screenshot(self, prefix, data):
        png = base64.b64decode(data['png'])
        dirname = os.path.join(self.screenshot_dir, prefix)
        self.makedir(dirname)
        fn = os.path.join(dirname, md5(png).hexdigest() + '.png')
        print fn
        with open(fn, 'wb') as fp:
            fp.write(png)
        return fn

    def process_splash_response(self, url, splash_response):
        data = json.loads(splash_response.text, encoding='utf8')
        screenshot_path = self.save_screenshot(get_domain(url), data)
        html_rendered = data["html"]
        return screenshot_path, html_rendered

    def request_and_save(self, url):
        print "Getting screenshot for %s" % url
        splash_response = self.splash_request(url)
        screenshot_path, html_rendered = self.process_splash_response(
            url, splash_response)
        self.mmu.set_screenshot_path(url, screenshot_path)
        self.mmu.set_html_rendered(url, html_rendered)

    def resolve_images_by_host(self, host):
        url_dics = self.mmu.list_urls(host, limit=2000)
        for url_dic in url_dics:
            self.request_and_save(url_dic["url"])

    def resolve_images_by_url_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            # Get only if it doesn't have an existing screenshot.
            if "screenshot_path" not in url_dic:
                # String matching for now; makes more sense as a regex.
                if match_term in url_dic["url"]:
                    self.request_and_save(url_dic["url"])

    def resolve_images_by_host_match(self, match_term):
        url_dics = self.mmu.list_all_urls()
        for url_dic in url_dics:
            # Get only if it doesn't have an existing screenshot.
            if "screenshot_path" not in url_dic:
                # String matching for now; makes more sense as a regex.
                if match_term in url_dic["host"]:
                    self.request_and_save(url_dic["url"])

    def get_url_chunks(self, chunk_size):
        url_dics = self.mmu.list_all_urls()
        for i in xrange(0, len(url_dics), chunk_size):
            yield url_dics[i:i + chunk_size]
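A usage sketch, assuming a running Splash instance is reachable at SPLASH_URL and that screenshots should be written under a local directory; the host and match term are illustrative only.

if __name__ == '__main__':
    getter = SplashGet(screenshot_dir="screenshots")
    # Screenshot every stored URL for one host (hypothetical host name).
    getter.resolve_images_by_host("example.com")
    # Or only URLs whose host matches a term and that lack a screenshot.
    getter.resolve_images_by_host_match("forum")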
def open_spider(self, spider):
    self.mongo_address, self.mongo_port = self.mongo_uri.split(":")
    print self.mongo_address, self.mongo_port
    self.mmu = MemexMongoUtils(address=self.mongo_address, port=int(self.mongo_port))