Example No. 1
# -*- coding: utf-8 -*-
# Imports inferred from usage below (this code base is Python 2, hence the
# print statements and the urlparse module):
import re
import urlparse

from lxml import etree

# Project-local names assumed to be importable from this code base:
# Settings, settings_name, FileIOMiddleware, Doraemon, BrowserRequest
class Topbaidu(object):
    def __init__(self):

        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(Settings.LOG_PATH)

    def getSettings(self):
        # settings_name is assumed to be a module-level dict created via
        # Settings.CreateSettings(...), as in the Huxiu example below.
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = Settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.today = Settings.TODAY

    def parse(self, response):
        # response['response'] is the browser driver; its URL is kept as a
        # GBK byte string (Python 2 str).
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        # Every element whose class contains 'article-item-title' is a candidate link.
        href_items = html.xpath(".//*[contains(@class, 'article-item-title')]")
        for item in href_items:
            href = item.xpath("@href")
            if len(href) == 0:
                continue
            href_url = href[0]
            # Keep only links that carry a numeric article id somewhere in the URL.
            hasId = str(filter(str.isdigit, href_url))  # Python 2: filter() on a str returns a str
            if len(hasId) == 0:
                print 'Invalid url for no id: {0}'.format(href_url)
                continue
            # Whitelist/blacklist filtering. Since valid starts as True, the
            # goodkeys loop never changes it; both lists are empty in
            # start_requests(), so this is effectively a pass-through.
            valid = True
            for good in self.goodkeys:
                if valid:
                    continue
                if good in href_url:
                    valid = True
            for bad in self.badkeys:
                if not valid:
                    continue
                if bad in href_url:
                    valid = False
            if valid:
                # The id is the last token after splitting on '.', ',', '/', ' ' and '_'.
                short_url_parts = re.split(r'[.,/ _]', href_url)
                id = short_url_parts[-1]  # note: shadows the id() builtin locally
                url = urlparse.urljoin(current_url, href_url)
                title = ""
                title_list1 = item.xpath(".//text()")
                if len(title_list1) > 0:
                    title = title_list1[0]
                    print title
                is_title_empty = self.doraemon.isEmpty(title)
                if (is_title_empty is False) and (
                        self.doraemon.isDuplicated(title) is False):
                    data = {
                        'title': title.strip(),
                        'url': url.strip(),
                        'id': id.strip(),
                        'download_time': self.today
                    }
                    self.file.logger(
                        self.log_path,
                        'Start to store mongo {0}'.format(data['url']))
                    print 'Start to store mongo {0}'.format(data['url'])
                    self.doraemon.storeMongodb(self.mongo, data)
                    self.file.logger(
                        self.log_path,
                        'End to store mongo {0}'.format(data['url']))
                    print 'End to store mongo {0}'.format(data['url'])
                    self.file.logger(self.log_path, 'Done for {0}'.format(url))
                else:
                    if is_title_empty:
                        self.file.logger(self.log_path,
                                         'Empty title for {0}'.format(url))
                        print 'Empty title for {0}'.format(url)
                    print 'Finished or empty title for {0}'.format(url)
            else:
                self.file.logger(self.log_path, 'Invalid {0}'.format(href_url))
                print 'Invalid {0}'.format(href_url)
        # Use current_url here: href_url is unbound when no links matched.
        print 'End to parse {0}'.format(current_url)

    def start_requests(self):
        if self.doraemon.isExceedRestartInterval(
                self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print 'Start {0} requests'.format(self.name)
        self.badkeys = []   # substring blacklist consulted in parse(); empty by default
        self.goodkeys = []  # substring whitelist consulted in parse(); empty by default

        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return

        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.file.logger(
            self.log_path,
            'End for {0} requests of {1}.'.format(str(len(content)),
                                                  self.name))
        print 'End for {0} requests of {1}.'.format(str(len(content)),
                                                    self.name)
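
A minimal way to drive this spider, sketched under the assumption that the project-local modules above resolve, is to instantiate the class and call start_requests(); note the method already returns early when the restart interval has not elapsed:

if __name__ == '__main__':
    # Hypothetical entry point, not part of the original source.
    spider = Topbaidu()
    spider.start_requests()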
Example No. 2
# -*- coding: utf-8 -*-
# Imports inferred from usage below (Python 2, as in the first example):
import gc
import urlparse

from lxml import etree

# Project-local names assumed to be importable from this code base:
# Settings, FileIOMiddleware, Doraemon, BrowserRequest
class Huxiu(object):
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = settings_name['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = settings_name['IS_OPEN_CACHE']

    def parse(self, response):
        # As in the Topbaidu example, the driver URL is kept as a GBK byte
        # string (Python 2 str).
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        # Field defaults; each is overwritten below when its xpath matches.
        comment_number = ""
        title = ""
        url = ""
        id = ""
        share_number = ""
        image_url = ""
        content = ""
        time = ""
        author_url = ""
        author_name = ""
        valid = False

        url = current_url
        # current_url is already a GBK byte string, so no further encode is
        # needed before filtering out the digits.
        id = str(filter(str.isdigit, current_url))  # note: shadows the id() builtin
        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = str(
                filter(str.isdigit, comment_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = str(
                filter(str.isdigit, share_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            time = ''.join(time1).strip()
            time = self.doraemon.getDateFromString(time)
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = author_name1[0].strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': url,
            'public_time': time,
            'author_url': author_url,
            'author_name': author_name,
            'id': id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print 'End to parse: {0}'.format(current_url)
        if valid and self.doraemon.isEmpty(title) is False:
            self.file.logger(self.log_path,
                             'Start to store mongo {0}'.format(data['url']))
            print 'Start to store mongo {0}'.format(data['url'])
            self.doraemon.storeMongodb(self.mongo, data)
            self.file.logger(self.log_path,
                             'End to store mongo {0}'.format(data['url']))
            print 'End to store mongo {0}'.format(data['url'])
            self.doraemon.storeTxt(id, content, self.finished_txt_path,
                                   self.name)
        # Mark the request finished in the bloom filter whether or not the
        # page yielded valid content.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])
        # Drop references and collect so this long-running crawler releases
        # page data promptly.
        del current_url, html, title, comment_number, share_number, image_url, url, content, time, author_url, author_name, id, data
        gc.collect()

    def start_requests(self):
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print 'Start ' + self.name + ' requests'
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        # new_url_titles = [['https://www.huxiu.com/article/36.html', '【WHAT】十年内10大互联网IPO']]
        if len(new_url_titles) == 0:
            self.file.logger(self.log_path,
                             'No new url for: {0}'.format(self.name))
            print 'No new url for: {0}'.format(self.name)
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.file.logger(self.log_path,
                         'End requests: {0}'.format(str(len(content))))
        print 'End requests: {0}'.format(str(len(content)))
        del new_url_titles, request, content
        gc.collect()
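
As with the first example, a minimal launch sketch under the same assumptions; start_requests() exits on its own when readNewUrls() finds no pending URLs:

if __name__ == '__main__':
    # Hypothetical entry point, not part of the original source.
    spider = Huxiu()
    spider.start_requests()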