Example #1
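 # assumed context, not shown in this excerpt: import random, time, and the
 # project's MysqlPool connection-pool helper used throughout these examples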
 def get_random_proxy_https(self):
     '''Pick a random proxy from the database.'''
     dbpool = MysqlPool()
     select_sql = "select proxies_link from proxies where proxies_status = 1 and proxies_type = 'https'"
     results = dbpool.getAll(select_sql)
     if not results:
         print('!!! the proxy table is empty !!!')
         time.sleep(100)
         return 0
         # dbpool = MysqlPool()
         # results = dbpool.getAll(select_sql)
     res = random.choice(results)
     return res['proxies_link']
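
A minimal sketch of how a getter like this is typically plugged into a Scrapy downloader middleware. RandomHttpsProxyMiddleware and ProxyTool are hypothetical names (ProxyTool standing in for whatever class defines get_random_proxy_https), and proxies_link is assumed to hold host:port strings:

class RandomHttpsProxyMiddleware(object):
    # hypothetical middleware; it would be enabled via DOWNLOADER_MIDDLEWARES in settings.py
    def __init__(self):
        self.proxy_tool = ProxyTool()  # assumed owner of get_random_proxy_https()

    def process_request(self, request, spider):
        proxy = self.proxy_tool.get_random_proxy_https()
        if proxy:  # the getter returns 0 when the proxy table is empty
            request.meta['proxy'] = 'https://' + proxy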
Example #2
class GooglePaperPipeline(object):
    '''
    Pipeline class that writes items to the database:
       1. register it in settings.py;
       2. yield an item from your spider and process_item runs automatically.
    '''

    dbpool = MysqlPool()

    # called by Scrapy for every item the spider yields
    def process_item(self, item, spider):

        paper_id = item['paper_id']
        paper_nbCitation = item['paper_nbCitation']
        paper_citationURL = item['paper_citationURL']
        paper_rawURL = item['paper_rawURL']
        paper_isseen = item['paper_isseen']
        paper_pdfURL = item['paper_pdfURL']
        paper_scholarInfo = item['paper_scholarInfo']
        paper_rawInfo = item['paper_rawInfo']
        paper_relatedURL = item['paper_relatedURL']

        sql_update = "UPDATE paper SET paper_nbCitation = '%d'\
        					, paper_isseen= '%d', paper_citationURL = '%s', paper_pdfURL = '%s'\
        					, paper_rawURL= '%s', paper_scholarInfo = '%s', paper_rawInfo = '%s', paper_relatedURL = '%s'\
        					WHERE paper_id='%d'" \
                     % (paper_nbCitation, paper_isseen, paper_citationURL.replace('\'', '\\\'').strip(),
                        paper_pdfURL.replace('\'', '\\\'').strip(), paper_rawURL.replace('\'', '\\\'').strip(),
                        paper_scholarInfo.replace('\'', '\\\'').strip(), paper_rawInfo.replace('\'', '\\\'').strip(),
                        paper_relatedURL.replace('\'', '\\\'').strip(), paper_id)

        self.dbpool.update(sql_update)
        self.dbpool.end()
        print(paper_id, 'updated successfully')
        return item  # pass the item on to any later pipelines
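
Per the docstring, a pipeline like this is switched on in settings.py. A sketch of that wiring, assuming the paperScrapy.pipelines module path used by the spiders in Examples #5 and #6 (the pipelines in Examples #3 and #4 are registered the same way):

# settings.py (hypothetical project path; the number is the pipeline priority)
ITEM_PIPELINES = {
    'paperScrapy.pipelines.GooglePaperPipeline': 300,
}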
Example #3
class DblpPaperPipeline(object):
    '''
    Pipeline class that writes items to the database:
       1. register it in settings.py;
       2. yield an item from your spider and process_item runs automatically.
    '''

    dbpool = MysqlPool()

    # called by Scrapy for every item the spider yields
    def process_item(self, item, spider):
        paper_id = item['paper_id']
        dblp_name = item['name']
        dblp_year = item['year']

        if dblp_year != -1:
            sql_update = "update targetpaper set targetPaper_dblp_name = %s, targetPaper_publicationYear = %s " \
                         "where targetPaper_id = %s"
            params = (dblp_name, dblp_year, paper_id)
        else:
            sql_update = "update targetpaper set targetPaper_dblp_name = %s " \
                         "where targetPaper_id = %s"
            params = (dblp_name, paper_id)

        # print 'params', params
        self.dbpool.update(sql_update, params)
        self.dbpool.end()
        print(paper_id, 'updated successfully')
        return item  # pass the item on to any later pipelines
Example #4
class DblpPipeline(object):
    '''
    Pipeline class that writes items to the database:
       1. register it in settings.py;
       2. yield an item from your spider and process_item runs automatically.
    '''

    dbpool = MysqlPool()

    # called by Scrapy for every item the spider yields
    def process_item(self, item, spider):
        venue_type = item["venue_type"]

        if venue_type == 'CCF':
            self.ccf_dblp(item)
        elif venue_type == 'CORE':
            self.core_dblp(item)
        else:
            print('Unknown venue type:', venue_type)

        return item  # pass the item on to any later pipelines

    def ccf_dblp(self, item):
        """
        Store the dblp name in the ccf table.
        :param item: the item passed in
        :return:
        """

        dblp_name = item["name"]
        venue_id = item["venue_id"]

        # look up the dblp names currently stored for this venue_id
        select_sql = "SELECT CCF_dblpname, CCF_dblpname2, CCF_dblpname3 " \
                     "FROM ccf WHERE CCF_id = %s"
        dblp_ans = self.dbpool.getAll(select_sql, (venue_id,))
        dblp_ans = dblp_ans[0]
        ccf_dblpname1 = dblp_ans["CCF_dblpname"]
        ccf_dblpname2 = dblp_ans["CCF_dblpname2"]
        ccf_dblpname3 = dblp_ans["CCF_dblpname3"]
        print('save to sql:', dblp_name)
        # fill the earliest empty name slot first
        if ccf_dblpname1 == "NOT IN DBLP":
            sql = "update ccf set CCF_dblpname = %s where CCF_id = %s "
        elif ccf_dblpname2 is None:
            sql = "update ccf set CCF_dblpname2 = %s where CCF_id = %s "
        else:
            sql = "update ccf set CCF_dblpname3 = %s where CCF_id = %s "

        self.dbpool.update(sql, (dblp_name, venue_id))

        self.dbpool.end()
        print(venue_id, 'ccf updated successfully')

    def core_dblp(self, item):
        """
        Store the dblp name in the core table.
        :param item: the item passed in
        :return:
        """

        dblp_name = item["name"]
        venue_id = item["venue_id"]

        # look up the dblp names currently stored for this venue_id
        select_sql = "SELECT CORE_dblpname, CORE_dblpname2, CORE_dblpname3 " \
                     "FROM core WHERE CORE_id = %s"
        dblp_ans = self.dbpool.getAll(select_sql, (venue_id,))
        dblp_ans = dblp_ans[0]
        core_dblpname1 = dblp_ans["CORE_dblpname"]
        core_dblpname2 = dblp_ans["CORE_dblpname2"]
        core_dblpname3 = dblp_ans["CORE_dblpname3"]
        print('save to sql:', dblp_name)
        # fill the earliest empty name slot first
        if core_dblpname1 == "NOT IN DBLP":
            sql = "update core set CORE_dblpname = %s where CORE_id = %s "
        elif core_dblpname2 is None:
            sql = "update core set CORE_dblpname2 = %s where CORE_id = %s "
        else:
            sql = "update core set CORE_dblpname3 = %s where CORE_id = %s "

        self.dbpool.update(sql, (dblp_name, venue_id))

        self.dbpool.end()
        print(venue_id, 'core updated successfully')
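
ccf_dblp and core_dblp differ only in the table name and column prefix, so they can collapse into one method. A sketch of a generic replacement (a hypothetical helper, not part of the original class); MySQL cannot parameterize identifiers, so the table and column names are interpolated while the values stay parameterized:

    def _save_dblp_name(self, item, table, prefix):
        """Generic version of ccf_dblp/core_dblp: fill the first free
        <prefix>_dblpname slot of the venue's row."""
        dblp_name, venue_id = item["name"], item["venue_id"]
        cols = [prefix + "_dblpname", prefix + "_dblpname2", prefix + "_dblpname3"]
        row = self.dbpool.getAll(
            "SELECT %s FROM %s WHERE %s_id = %%s" % (", ".join(cols), table, prefix),
            (venue_id,))[0]
        if row[cols[0]] == "NOT IN DBLP":
            target = cols[0]  # first slot still holds the placeholder
        elif row[cols[1]] is None:
            target = cols[1]  # second slot is empty
        else:
            target = cols[2]  # fall back to the third slot
        self.dbpool.update(
            "UPDATE %s SET %s = %%s WHERE %s_id = %%s" % (table, target, prefix),
            (dblp_name, venue_id))
        self.dbpool.end()

It would be called as self._save_dblp_name(item, 'ccf', 'CCF') or self._save_dblp_name(item, 'core', 'CORE') from process_item.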
Example #5
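# assumed imports, not shown in this excerpt: scrapy, Request (from scrapy),
# plus the project's PaperscrapyItem and MysqlPool helpers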
class DblpSpider(scrapy.Spider):

    name = "dblpSpider_jour"

    # select the pipeline class used for storage
    custom_settings = {
        'ITEM_PIPELINES': {
            'paperScrapy.pipelines.DblpPipeline': 1,
        }
    }

    headers = {
        'Host': 'dblp.uni-trier.de',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept':
        'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Referer': 'http://dblp.uni-trier.de/',
        'Cookie':
        'dblp-hideable-show-feeds=true; dblp-hideable-show-rawdata=true; dblp-view=y; dblp-search-mode=c',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }

    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    # }

    mypool = MysqlPool()  # create the connection pool

    # select CCF venues not yet matched to dblp
    ccf_sql_select = "SELECT CCF_id, CCF_name " \
                 "FROM ccf WHERE CCF_id<10000000 AND CCF_dblpname = %s and CCF_type = 'journal'"

    ccf_venue_set = mypool.getAll(ccf_sql_select,
                                  ("NOT IN DBLP", ))  # all CCF venues still to be queried

    #  select CORE venues not yet matched to dblp
    core_sql_select = "SELECT CORE_id, CORE_name " \
                 "FROM core WHERE CORE_id<10000000 AND CORE_dblpname = %s and CORE_type = 'journal'"

    core_venue_set = mypool.getAll(core_sql_select,
                                   ("NOT IN DBLP", ))  # all CORE venues still to be queried

    # build the initial requests
    def start_requests(self):

        # for i in range(len(self.ccf_venue_set)):
        #
        #     # take the next venue from the CCF set
        #     venue_name = self.ccf_venue_set[i]["CCF_name"]
        #     venue_id = self.ccf_venue_set[i]["CCF_id"]
        #     line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
        #         .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
        #     url = 'http://dblp.uni-trier.de/search?q=' + line
        #
        #     # pass venue_id and venue_type via meta for the later database update
        #     venue_type = 'CCF'
        #     yield Request(url, headers=self.headers,
        #                   meta={'venue_id': venue_id, 'venue_type': venue_type},
        #                   callback=self.parse_venue)
        #     sleep(2)        # throttle between requests

        for i in range(len(self.core_venue_set)):
            # take the next venue from the CORE set
            venue_name = self.core_venue_set[i]["CORE_name"]
            venue_id = self.core_venue_set[i]["CORE_id"]
            line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
                .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
            url = 'http://dblp.uni-trier.de/search?q=' + line

            # pass venue_id and venue_type via meta for the later database update
            venue_type = 'CORE'
            yield Request(url,
                          headers=self.headers,
                          meta={
                              'venue_id': venue_id,
                              'venue_type': venue_type
                          },
                          callback=self.parse_venue)
            # sleep(2)        # throttle between requests
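
        # note: the chained .replace calls above hand-roll the percent-encoding of
        # the query string; urllib covers the same characters, e.g.:
        #     from urllib.parse import quote
        #     url = 'http://dblp.uni-trier.de/search?q=' + quote(venue_name, safe='')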

    # not used yet
    def parse(self, response):
        item = PaperscrapyItem()  # instantiate the project's own item class
        yield item

    def parse_venue(self, response):
        """
        找到匹配到的期刊
        :param response: 输入期刊后得到的响应
        :return: 发起对期刊的请求
        """
        try:
            venue_id = response.meta['venue_id']  # read venue_id back from meta
            venue_type = response.meta['venue_type']  # read venue_type back from meta
            print('parse_venue: venue_id', venue_id)
            # collect the matched hrefs
            venue_ul = response.xpath(
                '//div[@id="completesearch-venues"]/div/ul'
            )  # separates exact matches from likely matches

            if len(venue_ul) == 0:
                raise Exception("No matches!")

            venue_url = venue_ul[0].xpath('.//li/a/@href').extract()
            href_num = len(venue_url)

            matches_type = response.xpath(
                '//*[@id="completesearch-venues"]/div/p[1]/text()').extract()

            # print 'the type is ', matches_type[0], 'the num of venue', href_num

            paper_type = 'journal'
            if href_num > 2:  # more than two URLs: too many candidate links
                raise Exception("Too many matched venues!")
            elif href_num == 2:  # with two links, keep the one containing 'journal'
                # check the match type first
                if matches_type[0] != 'Exact matches':
                    raise Exception("Too many matched venues!")

                if paper_type not in venue_url[0] and paper_type not in venue_url[1]:
                    raise Exception("No matching venue!")
                # if both links are journals, is that too many matches, or do both qualify?
                # if paper_type in venue_url[0] and paper_type in venue_url[1]:
                #     raise Exception("Too many matched venues!")
                # move the journal link into slot 0
                if paper_type not in venue_url[0]:
                    venue_url[0] = venue_url[1]

            elif href_num == 1:
                if paper_type not in venue_url[0]:
                    raise Exception("No matching venue!")

        except Exception as e:  # too many matches, or none at all
            print(e.args[0])
            # print 'venue_url',venue_url
        else:
Example #6
class DblpSpider(scrapy.Spider):

    name = "dblpSpider_conf"

    # select the pipeline class used for storage
    custom_settings = {
        'ITEM_PIPELINES': {
            'paperScrapy.pipelines.DblpPipeline': 1,
        }
    }

    headers = {
        'Host': 'dblp.uni-trier.de',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept':
        'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Referer': 'http://dblp.uni-trier.de/',
        'Cookie':
        'dblp-hideable-show-feeds=true; dblp-hideable-show-rawdata=true; dblp-view=y; dblp-search-mode=c',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }

    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    # }

    mypool = MysqlPool()  # create the connection pool

    # select CCF venues not yet matched to dblp
    ccf_sql_select = "SELECT CCF_id, CCF_name, CCF_abbreviation " \
                 "FROM ccf WHERE CCF_id<10000000 AND CCF_dblpname = %s and CCF_type = 'conference'"

    ccf_venue_set = mypool.getAll(ccf_sql_select, ("MDM", ))  # 记录所有待查询的venue集合

    #  select CORE venues not yet matched to dblp
    core_sql_select = "SELECT CORE_id, CORE_name, CORE_abbreviation " \
                 "FROM core WHERE CORE_id<10000000 AND CORE_dblpname = %s and CORE_type = 'conference'"

    core_venue_set = mypool.getAll(core_sql_select,
                                   ("NOT IN DBLP", ))  # all CORE venues still to be queried

    #  select MAG venues not yet matched to dblp
    mag_sql_select = "SELECT MAG_id, MAG_name, MAG_abbreviation " \
                      "FROM mag WHERE mag_dblpname = %s "

    mag_venue_set = mypool.getAll(mag_sql_select,
                                  ("NOT IN DBLP", ))  # all MAG venues still to be queried

    # build the initial requests
    def start_requests(self):

        venue_type = 'CCF'
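        # 'CCF' is hard-coded here; switch it to 'CORE' or 'MAG' to crawl the other sets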
        if venue_type == 'CCF':
            target_set = self.ccf_venue_set
        elif venue_type == 'CORE':
            target_set = self.core_venue_set
        elif venue_type == 'MAG':
            target_set = self.mag_venue_set
        else:
            print('The type does not exist!')
            return

        # for i in range(len(target_set)-1, -1, -1):
        num = len(target_set)
        print('the total number is', num)
        for i in range(len(target_set)):

            # take the next venue from the target set

            vid = venue_type + '_id'
            vname = venue_type + '_name'
            # vname = venue_type + '_abbreviation'

            venue_id = target_set[i][vid]
            venue_name = target_set[i][vname]

            line = venue_name.replace("%", "%25").replace(" ", "%20").replace(",", "%2C")\
                .replace(":", "%3A").replace("?", "%3F").replace("&", "%26").replace("'", "%27")
            url = 'http://dblp.uni-trier.de/search?q=' + line

            # pass venue_id and venue_type via meta for the later database update

            yield Request(url,
                          headers=self.headers,
                          meta={
                              'venue_id': venue_id,
                              'venue_type': venue_type
                          },
                          callback=self.parse_venue)
            num -= 1
            print('-------- left', num, '-----------')

    def parse_venue(self, response):
        """
        找到匹配到的期刊
        :param response: 输入期刊后得到的响应
        :return: 发起对期刊的请求
        """
        try:
            venue_id = response.meta['venue_id']  # read venue_id back from meta
            venue_type = response.meta['venue_type']  # read venue_type back from meta
            dblp_name = 'NOT IN DBLP'
            print('parse_venue: venue_id', venue_id)

            # locate the results <ul> block
            venue_ul = response.xpath(
                '//div[@id="completesearch-venues"]/div/ul'
            )  # separates exact matches from likely matches
            if len(venue_ul) == 0:
                raise Exception("No matches!")

            # collect the matched hrefs
            venue_url = venue_ul[0].xpath('.//li/a/@href').extract()

            matches_type = response.xpath(
                '//*[@id="completesearch-venues"]/div/p[1]/text()').extract()
            # matches_name = venue_ul[0].xpath('.//li[1]/a/text()').extract()
            # //*[@id="completesearch-venues"]/div/ul/li/a/text()
            # print 'matches_name', matches_name
            # tmp_name = re.match(".*\((.*)\).*", matches_name[-1]).group(1)
            # print 'tmp_name:', tmp_name

            print('the original url list is', venue_url)
            # keep only the conference ('conf') links
            conf_url = set()
            paper_type = 'conf'
            for vurl in venue_url:
                if paper_type in vurl:
                    conf_url.add(vurl)

            # the set deduplicated the links; convert back to a list for indexing
            conf_url = list(conf_url)
            print('the new url list is', conf_url)
            conf_num = len(conf_url)
            if conf_num == 0:
                raise Exception("No matching conference!")
            elif conf_num == 1:
                tmp_name = conf_url[0].split('/')[-2]
                dblp_name = tmp_name.upper()
            else:
                dblp_name = 'MORE'
                if matches_type[0] == 'Exact matches':
                    raise Exception("Too many matches in exact matches!")
                else:
                    raise Exception("Too many matches in likely matches!")

        except Exception as e:  # too many matches, or none at all
            print(e.args[0])
            # yield Request(venue_url[0], headers=self.headers,
            #               meta={'venue_id': venue_id, 'venue_type': venue_type},
            #               callback=self.parse_short)
            # print 'venue_url',venue_url
        # else:
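        # with the else commented out, an item is yielded even when matching fails,
        # so 'NOT IN DBLP' (or 'MORE') is written back to the table for this venue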
        paper_item = PaperscrapyItem()  # build the project's item and fill it in
        print('dblp_name is', dblp_name)
        paper_item['name'] = dblp_name
        paper_item['venue_id'] = venue_id
        paper_item['venue_type'] = venue_type
        yield paper_item
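
Both spiders are started with Scrapy's standard CLI, using the name attributes defined above: scrapy crawl dblpSpider_jour for Example #5 and scrapy crawl dblpSpider_conf for Example #6.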