def get_random_proxy_https(self): '''随机从数据库中读取proxy''' dbpool = MysqlPool() select_sql = "select proxies_link from proxies where proxies_status = 1 and proxies_type = 'https'" results = dbpool.getAll(select_sql) if not results: print '!!!!!!!!!!!代理数据库空!!!!!!!!!' time.sleep(100) return 0 # dbpool = MysqlPool() # results = dbpool.getAll(select_sql) res = random.choice(results) return res['proxies_link']
class GooglePaperPipeline(object): ''' 保存到数据库中对应的class 1、在settings.py文件中配置 2、在自己实现的爬虫类中yield item,会自动执行 ''' dbpool = MysqlPool() # pipeline默认调用 def process_item(self, item, spider): paper_id = item['paper_id'] paper_nbCitation = item['paper_nbCitation'] paper_citationURL = item['paper_citationURL'] paper_rawURL = item['paper_rawURL'] paper_isseen = item['paper_isseen'] paper_pdfURL = item['paper_pdfURL'] paper_scholarInfo = item['paper_scholarInfo'] paper_rawInfo = item['paper_rawInfo'] paper_relatedURL = item['paper_relatedURL'] sql_update = "UPDATE paper SET paper_nbCitation = '%d'\ , paper_isseen= '%d', paper_citationURL = '%s', paper_pdfURL = '%s'\ , paper_rawURL= '%s', paper_scholarInfo = '%s', paper_rawInfo = '%s', paper_relatedURL = '%s'\ WHERE paper_id='%d'" \ % (paper_nbCitation, paper_isseen, paper_citationURL.replace('\'', '\\\'').strip(), paper_pdfURL.replace('\'', '\\\'').strip(), paper_rawURL.replace('\'', '\\\'').strip(), paper_scholarInfo.replace('\'', '\\\'').strip(), paper_rawInfo.replace('\'', '\\\'').strip(), paper_relatedURL.replace('\'', '\\\'').strip(), paper_id) self.dbpool.update(sql_update) self.dbpool.end() print paper_id, ' is updated successful!'
class DblpPaperPipeline(object): ''' 保存到数据库中对应的class 1、在settings.py文件中配置 2、在自己实现的爬虫类中yield item,会自动执行 ''' dbpool = MysqlPool() # pipeline默认调用 def process_item(self, item, spider): paper_id = item['paper_id'] dblp_name = item['name'] dblp_year = item['year'] if dblp_year != -1: sql_update = "update targetpaper set targetPaper_dblp_name = %s, targetPaper_publicationYear = %s " \ "where targetPaper_id = %s" params = (dblp_name, dblp_year, paper_id) else: sql_update = "update targetpaper set targetPaper_dblp_name = %s " \ "where targetPaper_id = %s" params = (dblp_name, paper_id) # print 'params', params self.dbpool.update(sql_update, params) self.dbpool.end() print paper_id, ' is updated successful!'
class DblpPipeline(object): ''' 保存到数据库中对应的class 1、在settings.py文件中配置 2、在自己实现的爬虫类中yield item,会自动执行 ''' dbpool = MysqlPool() # pipeline默认调用 def process_item(self, item, spider): venue_type = item["venue_type"] if venue_type == 'CCF': self.ccf_dblp(item) elif venue_type == 'CORE': self.core_dblp(item) else: print 'No this type:', venue_type def ccf_dblp(self, item): """ 存入ccf的表dblp名称 :param item: 传递过来的内容 :return: """ dblp_name = item["name"] venue_id = item["venue_id"] # 查询当前对应venue_id对应的dblp 名称 select_sql = "SELECT CCF_dblpname, CCF_dblpname2, CCF_dblpname3 " \ "FROM ccf WHERE CCF_id = %s" dblp_ans = self.dbpool.getAll(select_sql, (venue_id,)) dblp_ans = dblp_ans[0] ccf_dblpname1 = dblp_ans["CCF_dblpname"] ccf_dblpname2 = dblp_ans["CCF_dblpname2"] ccf_dblpname3 = dblp_ans["CCF_dblpname3"] print 'save to sql:', dblp_name # 按顺序先更新前面的名称 if ccf_dblpname1 == "NOT IN DBLP": sql = "update ccf set CCF_dblpname = %s where CCF_id = %s " elif ccf_dblpname2 is None: sql = "update ccf set CCF_dblpname2 = %s where CCF_id = %s " else: sql = "update ccf set CCF_dblpname3 = %s where CCF_id = %s " self.dbpool.update(sql, (dblp_name, venue_id)) self.dbpool.end() print venue_id, 'ccf is updated successful!' 
def core_dblp(self, item): """ 存入core表的dblp名称 :param item: 传递过来的内容 :return: """ dblp_name = item["name"] venue_id = item["venue_id"] # 查询当前对应venue_id对应的dblp 名称 select_sql = "SELECT CORE_dblpname, CORE_dblpname2, CORE_dblpname3 " \ "FROM core WHERE CORE_id = %s" dblp_ans = self.dbpool.getAll(select_sql, (venue_id,)) dblp_ans = dblp_ans[0] core_dblpname1 = dblp_ans["CORE_dblpname"] core_dblpname2 = dblp_ans["CORE_dblpname2"] core_dblpname3 = dblp_ans["CORE_dblpname3"] print 'save to sql:', dblp_name # 按顺序先更新前面的名称 if core_dblpname1 == "NOT IN DBLP": sql = "update core set CORE_dblpname = %s where CORE_id = %s " elif core_dblpname2 is None: sql = "update core set CORE_dblpname2 = %s where CORE_id = %s " else: sql = "update core set CORE_dblpname3 = %s where CORE_id = %s " self.dbpool.update(sql, (dblp_name, venue_id)) self.dbpool.end() print venue_id, ' core is updated successful!'
class DblpSpider(scrapy.Spider):
    """Spider that matches CCF/CORE *journal* names against dblp search.

    For each venue still marked "NOT IN DBLP" it issues a dblp search
    request; ``parse_venue`` then checks whether the results narrow down
    to exactly one journal URL. The CCF pass in ``start_requests`` is
    currently commented out — only CORE journals are processed.
    """
    name = "dblpSpider_jour"
    # Use the dedicated pipeline class for storage.
    custom_settings = {
        'ITEM_PIPELINES': {
            'paperScrapy.pipelines.DblpPipeline': 1,
        }
    }
    headers = {
        'Host': 'dblp.uni-trier.de',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Referer': 'http://dblp.uni-trier.de/',
        'Cookie': 'dblp-hideable-show-feeds=true; dblp-hideable-show-rawdata=true; dblp-view=y; dblp-search-mode=c',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    # }
    mypool = MysqlPool()  # create the connection pool

    # Select CCF journals that have not yet been matched to dblp.
    ccf_sql_select = "SELECT CCF_id, CCF_name " \
                     "FROM ccf WHERE CCF_id<10000000 AND CCF_dblpname = %s and CCF_type = 'journal'"
    ccf_venue_set = mypool.getAll(ccf_sql_select, ("NOT IN DBLP", ))  # all CCF venues awaiting lookup

    # Select CORE journals that have not yet been matched to dblp.
    core_sql_select = "SELECT CORE_id, CORE_name " \
                      "FROM core WHERE CORE_id<10000000 AND CORE_dblpname = %s and CORE_type = 'journal'"
    core_venue_set = mypool.getAll(core_sql_select, ("NOT IN DBLP", ))  # all CORE venues awaiting lookup

    # Build the initial requests.
    def start_requests(self):
        """Yield one dblp search request per CORE venue (CCF pass disabled)."""
        # for i in range(len(self.ccf_venue_set)):
        #     # take a venue from the CCF set
        #     venue_name = self.ccf_venue_set[i]["CCF_name"]
        #     venue_id = self.ccf_venue_set[i]["CCF_id"]
        #     line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
        #         .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
        #     url = 'http://dblp.uni-trier.de/search?q=' + line
        #     # pass venue_id / venue_type via meta for the later DB update
        #     venue_type = 'CCF'
        #     yield Request(url, headers=self.headers,
        #                   meta={'venue_id': venue_id, 'venue_type': venue_type},
        #                   callback=self.parse_venue)
        #     # sleep(2)  # throttle
        for i in range(len(self.core_venue_set)):
            # Take a venue from the CORE set.
            venue_name = self.core_venue_set[i]["CORE_name"]
            venue_id = self.core_venue_set[i]["CORE_id"]
            # Manual percent-encoding of characters the dblp search URL needs escaped.
            line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
                .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
            url = 'http://dblp.uni-trier.de/search?q=' + line
            # Pass venue_id / venue_type via meta for the later DB update.
            venue_type = 'CORE'
            yield Request(url,
                          headers=self.headers,
                          meta={
                              'venue_id': venue_id,
                              'venue_type': venue_type
                          },
                          callback=self.parse_venue)
            # # sleep(2)  # throttle

    # Currently unused.
    def parse(self, response):
        item = PaperscrapyItem()  # instantiate the project's item class
        yield item

    def parse_venue(self, response):
        """Check the dblp search response for exactly one journal match.

        :param response: response of the dblp search for the venue name
        :return: (intended) a follow-up request for the matched journal
        """
        try:
            venue_id = response.meta['venue_id']      # venue id carried via meta
            venue_type = response.meta['venue_type']  # venue type carried via meta
            print 'parse_venue: venue_id', venue_id
            # Result <ul> blocks; dblp separates exact from likely matches.
            venue_ul = response.xpath(
                '//div[@id="completesearch-venues"]/div/ul'
            )
            if len(venue_ul) == 0:
                raise Exception("No matches!")
            # All matched hrefs in the first block.
            venue_url = venue_ul[0].xpath('.//li/a/@href').extract()
            href_num = len(venue_url)
            matches_type = response.xpath(
                '//*[@id="completesearch-venues"]/div/p[1]/text()').extract()
            # print 'the type is ', matches_type[0], 'the num of venue', href_num
            paper_type = 'journal'
            if href_num > 2:
                # More than two URLs: treated as too ambiguous.
                raise Exception("Too many matches venue!")
            elif href_num == 2:
                # Two links: keep the one whose URL contains 'journal',
                # but only when the block is the exact-match block.
                if matches_type[0] != 'Exact matches':
                    raise Exception("Too many matches venue!")
                if paper_type not in venue_url[
                        0] and paper_type not in venue_url[1]:
                    raise Exception("Not matches venue!")
                # When BOTH links are journals — ambiguous, or accept both?
                # if paper_type in venue_url[0] and paper_type in venue_url[1]:
                #     raise Exception("Too many matches venue!")
                # Move the journal link into slot 0.
                if paper_type not in venue_url[0]:
                    venue_url[0] = venue_url[1]
            elif href_num == 1:
                if paper_type not in venue_url[0]:
                    raise Exception("Not matches venue!")
        except Exception, e:
            # Multiple matches or no match at all.
            print e.args[0]
            # print 'venue_url', venue_url
        else:
            # NOTE(review): the success branch appears truncated in this
            # file — no statements follow this `else:`; confirm against
            # the repository whether the follow-up Request was lost.
class DblpSpider(scrapy.Spider):
    """Spider matching CCF/CORE/MAG *conference* names against dblp search.

    For each venue of the hard-coded type it queries dblp; when the
    response narrows down to exactly one ``conf`` URL, the yielded item's
    ``name`` is the upper-cased dblp key of that URL. Otherwise the item
    carries a sentinel: 'NOT IN DBLP' (no match) or 'MORE' (ambiguous).
    """
    name = "dblpSpider_conf"
    # Use the dedicated pipeline class for storage.
    custom_settings = {
        'ITEM_PIPELINES': {
            'paperScrapy.pipelines.DblpPipeline': 1,
        }
    }
    headers = {
        'Host': 'dblp.uni-trier.de',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Referer': 'http://dblp.uni-trier.de/',
        'Cookie': 'dblp-hideable-show-feeds=true; dblp-hideable-show-rawdata=true; dblp-view=y; dblp-search-mode=c',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    # }
    mypool = MysqlPool()  # create the connection pool

    # Select CCF conferences awaiting a dblp match.
    # NOTE(review): the parameter here is "MDM", not "NOT IN DBLP" as in the
    # sibling queries/spiders — looks like a debugging leftover that limits
    # the run to a single venue; confirm before relying on this spider.
    ccf_sql_select = "SELECT CCF_id, CCF_name, CCF_abbreviation " \
                     "FROM ccf WHERE CCF_id<10000000 AND CCF_dblpname = %s and CCF_type = 'conference'"
    ccf_venue_set = mypool.getAll(ccf_sql_select, ("MDM", ))  # all CCF venues awaiting lookup

    # Select CORE conferences awaiting a dblp match.
    core_sql_select = "SELECT CORE_id, CORE_name, CORE_abbreviation " \
                      "FROM core WHERE CORE_id<10000000 AND CORE_dblpname = %s and CORE_type = 'conference'"
    core_venue_set = mypool.getAll(core_sql_select, ("NOT IN DBLP", ))  # all CORE venues awaiting lookup

    # Select MAG venues awaiting a dblp match.
    mag_sql_select = "SELECT MAG_id, MAG_name, MAG_abbreviation " \
                     "FROM mag WHERE mag_dblpname = %s "
    mag_venue_set = mypool.getAll(mag_sql_select, ("NOT IN DBLP", ))  # all MAG venues awaiting lookup

    # # Build the initial requests.
    def start_requests(self):
        """Yield one dblp search request per venue of the hard-coded type."""
        venue_type = 'CCF'  # hard-coded: this run processes CCF conferences
        if venue_type == 'CCF':
            target_set = self.ccf_venue_set
        elif venue_type == 'CORE':
            target_set = self.core_venue_set
        elif venue_type == 'MAG':
            target_set = self.mag_venue_set
        else:
            print 'The type does not exit!!!'
            return
        # for i in range(len(target_set)-1, -1, -1):
        num = len(target_set)
        print 'the total num is', num
        for i in range(len(target_set)):
            # Column names are derived from the venue type, e.g. CCF_id / CCF_name.
            vid = venue_type + '_id'
            vname = venue_type + '_name'
            # vname = venue_type + '_abbreviation'
            venue_id = target_set[i][vid]
            venue_name = target_set[i][vname]
            # Manual percent-encoding of characters the dblp search URL needs escaped.
            line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
                .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
            url = 'http://dblp.uni-trier.de/search?q=' + line
            # Pass venue_id / venue_type via meta for the later DB update.
            yield Request(url,
                          headers=self.headers,
                          meta={
                              'venue_id': venue_id,
                              'venue_type': venue_type
                          },
                          callback=self.parse_venue)
            num -= 1
            print '-------- left', num, '-----------'

    def parse_venue(self, response):
        """Decide the dblp name for a conference from the search response.

        :param response: response of the dblp search for the venue name
        :return: yields a PaperscrapyItem consumed by DblpPipeline
        """
        try:
            venue_id = response.meta['venue_id']      # venue id carried via meta
            venue_type = response.meta['venue_type']  # venue type carried via meta
            dblp_name = 'NOT IN DBLP'  # sentinel kept if no single match is found
            print 'parse_venue: venue_id', venue_id
            # Locate the result <ul>; dblp separates exact from likely matches.
            venue_ul = response.xpath(
                '//div[@id="completesearch-venues"]/div/ul'
            )
            if len(venue_ul) == 0:
                raise Exception("No matches!")
            # All matched hrefs in the first block.
            venue_url = venue_ul[0].xpath('.//li/a/@href').extract()
            matches_type = response.xpath(
                '//*[@id="completesearch-venues"]/div/p[1]/text()').extract()
            # matches_name = venue_ul[0].xpath('.//li[1]/a/text()').extract()
            # //*[@id="completesearch-venues"]/div/ul/li/a/text()
            # print 'matches_name', matches_name
            # tmp_name = re.match(".*\((.*)\).*", matches_name[-1]).group(1)
            # print 'tmp_name:', tmp_name
            print 'the original url list is ', venue_url
            # Keep only conference links (URLs containing 'conf').
            conf_url = set()
            paper_type = 'conf'
            for vurl in venue_url:
                if paper_type in vurl:
                    conf_url.add(vurl)
            # Deduplicated candidate list.
            conf_url = list(conf_url)
            print 'the new url list is ', conf_url
            conf_num = len(conf_url)
            if conf_num == 0:
                raise Exception("No matches conference!")
            elif conf_num == 1:
                # Single match: the second-to-last path segment is the dblp
                # key (e.g. .../db/conf/mdm/ -> 'mdm'); store it upper-cased.
                tmp_name = conf_url[0].split('/')[-2]
                dblp_name = tmp_name.upper()
            else:
                dblp_name = 'MORE'  # several candidates: record the ambiguity
                if matches_type[0] == 'Exact matches':
                    raise Exception("Too many matches in exact matches!")
                else:
                    raise Exception("Too many matches in likely matches!")
        except Exception, e:
            # Multiple matches or none; dblp_name keeps whatever sentinel
            # value it had when the exception was raised.
            print e.args[0]
            # yield Request(venue_url[0], headers=self.headers,
            #               meta={'venue_id': venue_id, 'venue_type': venue_type},
            #               callback=self.parse_short)
            # print 'venue_url',venue_url
        # else:
        # The item is emitted on every path; the pipeline stores whichever
        # name (real key or sentinel) was decided above.
        paper_item = PaperscrapyItem()
        print 'dblp_name is', dblp_name
        paper_item['name'] = dblp_name
        paper_item['venue_id'] = venue_id
        paper_item['venue_type'] = venue_type
        yield paper_item