Exemplo n.º 1
0
    def __init__(self):
        """Open the MySQL connection and remember the destination table.

        Reads module-level settings: db_host, db_user, db_password,
        db_name, db_table_name.
        """
        connection_settings = {
            'host': db_host,
            'user': db_user,
            'passwd': db_password,
        }
        self.db = Mysql(connection_settings, db_name)
        self.tableName = db_table_name
        self.item = None
Exemplo n.º 2
0
class ToutiaoPipeline(object):
    """Store crawled Toutiao list entries in a fixed MySQL sources table."""

    def __init__(self):
        """Create the database handle from module-level settings."""
        settings = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(settings, db_name)
        self.tableName = 'bb_toutiao_sources'
        self.item = None

    def process_item(self, item, spider):
        """Insert one row per crawled URL; empty items are logged and skipped."""
        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        public_time = int(time.time())
        create_time = int(time.time())

        # The item holds parallel lists; each position ties a URL to its
        # title and share/rss counters.
        for position, url in enumerate(item['url']):
            row = {
                'title': item['title'][position],
                'url': url,
                'unique_code': toMd5(url),
                'share_num': item['share_num'][position],
                'rss_num': item['rss_num'][position],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, row)

        return True
Exemplo n.º 3
0
class RssPipeline(object):
    """Store RSS crawl results in MySQL after optional Redis de-duplication."""

    def __init__(self):
        """Open the MySQL connection from the module-level settings."""
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(config, db_name)
        self.tableName = db_table_name
        self.item = None  # item currently being processed

    def process_item(self, item):
        """Filter duplicates out of *item* and insert the remaining rows.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            # Nothing was crawled for this page; log and skip.
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        # The item is a mapping keyed by unique code; insert each survivor.
        for uniqueCode in insertDataList:
            self.db.insert(self.tableName, insertDataList[uniqueCode])

        return True

    def filterAndPackageDgrate(self):
        """Drop records whose unique codes Redis reports as already seen.

        No-op when OPEN_REDIS_DISTINCT is disabled. Returns the
        (possibly reduced) item mapping.
        """
        if not OPEN_REDIS_DISTINCT:
            return self.item

        uniqueCodeList = self.item.keys()
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        logging.info('------------distinct before : %s ' % uniqueCodeList)
        # The original used enumerate() but never read the index;
        # iterate the duplicate codes directly.
        for unique in repeatUniqueCode:
            del self.item[unique]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Exemplo n.º 4
0
class ToutiaoPipeline(object):
    """Store crawled Toutiao list entries in a fixed MySQL sources table.

    Connection settings (db_host, db_user, db_password, db_name) are
    module-level values defined elsewhere in this file.
    """

    def __init__(self):
        # Open the MySQL connection; rows go into a hard-coded table.

        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = 'bb_toutiao_sources'
        self.item = None

    def process_item(self, item, spider):
        """Insert one row per crawled URL from *item*; always returns True."""

        if not item:
            # Empty item means the list page was a repeat; log and skip.
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        # Both timestamps record the moment of insertion.
        public_time = int(time.time())
        create_time = int(time.time())

        # The item holds parallel lists; index i ties title/url/counters.
        for i in xrange(0, len(item['url'])):
            insertData = {
                'title': item['title'][i],
                'url': item['url'][i],
                # unique_code is a hash of the URL via the toMd5 helper.
                'unique_code': toMd5(item['url'][i]),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
Exemplo n.º 5
0
class RssPipeline(object):
    """Store RSS crawl results in MySQL, optionally de-duplicating via Redis."""

    def __init__(self):
        """Set up the database handle and the destination table name."""
        settings = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(settings, db_name)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item):
        """Insert every record of *item* that survives de-duplication."""
        if not item:
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        remaining = self.filterAndPackageDgrate()
        for code in remaining:
            self.db.insert(self.tableName, remaining[code])

        return True

    def filterAndPackageDgrate(self):
        """Drop records whose codes Redis already knows; return the rest."""
        if not OPEN_REDIS_DISTINCT:
            return self.item

        codes = self.item.keys()
        duplicates = requstDistinct(codes)
        logging.info('------------distinct before : %s ' % codes)
        for duplicate in duplicates:
            del self.item[duplicate]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Exemplo n.º 6
0
    def __init__(self):
        """Open the MySQL connection and record the target table name.

        Relies on module-level settings defined elsewhere: db_host,
        db_user, db_password, db_name and db_table_name.
        """

        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        # No item yet; presumably set while processing — confirm against
        # the class's other methods (not visible in this chunk).
        self.item = None
Exemplo n.º 7
0
class SyncCrawlInfos(object):
    """Push un-synced crawl records to the remote side and record the result.

    Relies on module-level settings (db_host, db_user, db_password,
    db_name, db_table_name, SYNC_RECORDS_NUMS) and the syncCrawlInfos()
    helper defined elsewhere in this file.
    """

    def __init__(self):
        """Open the MySQL connection from the module-level settings."""
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(config, db_name)
        self.tableName = db_table_name

    def getRecords(self):
        """Return up to SYNC_RECORDS_NUMS rows still awaiting sync.

        Selects rows with is_sync=0 that have been attempted fewer than
        two times; returns [] when nothing is pending.
        """
        sql = ("select * from " + self.tableName +
               " where is_sync=0 and sync_times<2 order by id asc limit  " +
               str(SYNC_RECORDS_NUMS))
        records = self.db.findAll(sql)
        if not records:
            return []
        return list(records)

    def index(self):
        """Sync one batch of pending records; returns False when idle.

        Successful records get is_sync=1; every attempted record's
        sync_times counter is incremented so repeated failures are
        eventually dropped by getRecords().
        """
        beginTime = int(time.time())
        records = self.getRecords()
        if not records:
            logging.info('no data need sync!!')
            return False

        syncOverData = syncCrawlInfos(records)

        for record in records:
            uniqueCode = record['unique_code']
            # Build one shared UPDATE template; only the SET clause differs
            # between the success and failure cases.
            if uniqueCode in syncOverData:
                print("sync success %s " % uniqueCode)
                setClause = "`is_sync` = 1,`sync_times` = `sync_times`+1"
            else:
                print("sync fail %s " % uniqueCode)
                setClause = "`sync_times` = `sync_times`+1"
            # NOTE(review): SQL is built by string concatenation. uniqueCode
            # comes from our own table, so injection risk is low, but a
            # parameterized query would be safer if Mysql supports one.
            updateSql = ("update " + self.tableName + " set " + setClause +
                         " where `unique_code` = '" + uniqueCode + "' ")
            self.db.executeSql(updateSql)

        # Lazy %-args: formatting is skipped when INFO logging is disabled.
        logging.info('--------------sync records cast time : %s ',
                     int(time.time()) - beginTime)
        logging.info('--------------sync records success num : %s',
                     len(syncOverData))
        logging.info('--------------sync records success : %s', syncOverData)
        logging.info('--------------sync records fail num : %s',
                     len(records) - len(syncOverData))
        return True
Exemplo n.º 8
0
class SyncCrawlInfos(object):
    """Push un-synced crawl records to the remote side and record the result.

    Relies on module-level settings (db_host, db_user, db_password,
    db_name, db_table_name, SYNC_RECORDS_NUMS) and the syncCrawlInfos()
    helper defined elsewhere in this file.
    """

    def __init__(self):
        # Open the MySQL connection and remember the working table.

        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name

    def getRecords(self):
        """Return up to SYNC_RECORDS_NUMS rows still awaiting sync.

        Selects rows with is_sync=0 attempted fewer than two times;
        returns [] when nothing is pending.
        """

        sql = "select * from " + self.tableName + " where is_sync=0 and sync_times<2 order by id asc limit  " + str(SYNC_RECORDS_NUMS)
        records = self.db.findAll(sql)
        if not records:
            return []

        return list(records)

    def index(self):
        """Sync one batch of pending records; returns False when idle.

        Successful records get is_sync=1; every attempted record's
        sync_times counter is incremented so repeated failures are
        eventually dropped by getRecords().
        """

        beginTime = int(time.time())
        records = self.getRecords()
        if not records:
            logging.info('no data need sync!!')
            return False

        # syncCrawlInfos() presumably returns the unique codes accepted
        # remotely — confirm against its definition elsewhere in the file.
        syncOverData = syncCrawlInfos(records)

        for record in records:
            uniqueCode = record['unique_code']
            if uniqueCode in syncOverData:
                print "sync success %s " % uniqueCode
                updateSql = "update "+self.tableName+" set `is_sync` = 1,`sync_times` = `sync_times`+1 where `unique_code` = '"+uniqueCode+"' "
            else:
                print "sync fail %s " % uniqueCode
                updateSql = "update "+self.tableName+" set `sync_times` = `sync_times`+1 where `unique_code` = '"+uniqueCode+"' "
            # NOTE(review): SQL built by string concatenation; uniqueCode comes
            # from our own table, but a parameterized query would be safer.
            self.db.executeSql(updateSql)


        logging.info('--------------sync records cast time : %s ' % (int(time.time()) - beginTime)  )
        logging.info('--------------sync records success num : %s' % len(syncOverData))
        logging.info('--------------sync records success : %s' % syncOverData )
        logging.info('--------------sync records fail num : %s' % (len(records) - len(syncOverData)))
        return True
Exemplo n.º 9
0
class CrawlPipeline(object):
    """Persist crawled articles to MySQL; duplicates become updates."""

    def __init__(self):
        """Create the database handle from module-level settings."""
        settings = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(settings, db_name)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert the article carried by *item*; update when duplicated."""
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        if not (item['description'] or item['content']):
            # Nothing to store without either a description or content.
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        row = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        inserted = self.db.insert(self.tableName, row)
        if not inserted and spider.is_duplicate:
            where = "unique_code = '" + row['unique_code'] + "'"
            self.db.update(self.tableName, row, where)
            logging.info('========update.unique_code : %s' % row['unique_code'])

        return True
Exemplo n.º 10
0
class CrawlPipeline(object):
    """Store crawled articles in MySQL; duplicate rows fall back to updates."""

    def __init__(self):
        # Open the MySQL connection from module-level settings.

        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert the article carried by *item*; always returns True."""

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        # Skip items carrying neither a description nor content.
        if (not item['description']) and (not item['content']):
            return True

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score' : item['source_score'],
            'is_sync' : '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        insertOk = self.db.insert(self.tableName, insertData)
        # A failed insert during a duplicate spider run becomes an update
        # keyed by unique_code.
        if ( not insertOk )and spider.is_duplicate:
            self.db.update(self.tableName, insertData, "unique_code = '" + insertData['unique_code'] + "'")
            logging.info('========update.unique_code : %s' % insertData['unique_code'])

        return True
Exemplo n.º 11
0
class CommonCrawlPipeline(object):
    """Store generic crawl results (one article per item) in MySQL."""

    def __init__(self):
        """Open the MySQL connection from the module-level settings."""
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(config, db_name)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert one row built from *item*; always returns True.

        Items without a description are skipped. public_time and
        create_time are both stamped with the insertion time.
        """
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        rule_id = item['rule_id']
        # One timestamp for both fields so they can never straddle a second
        # boundary (the original called time.time() twice).
        now = int(time.time())
        public_time = now
        create_time = now

        img_url = json.dumps(item['img_url'])
        description = item['description']
        if not description:
            # Nothing worth storing without a description.
            return True

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Exemplo n.º 12
0
class CommonCrawlPipeline(object):
    """Store generic crawl results (one article per item) in MySQL."""

    def __init__(self):
        # Open the MySQL connection from module-level settings.

        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert one row built from *item*; always returns True."""

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        rule_id = item['rule_id']
        # Both timestamps record the moment of insertion.
        public_time = int(time.time())
        create_time = int(time.time())

        img_url = json.dumps(item['img_url'])
        description = item['description']
        # Items without a description are not stored.
        if not description:
            return True

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Exemplo n.º 13
0
class XmlFeedPipeline(object):
    """Store XML feed entries in MySQL after optional Redis de-duplication."""

    def __init__(self):
        """Open the MySQL connection from the module-level settings."""
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(config, db_name)
        self.tableName = db_table_name
        self.item = None  # item currently being processed

    def process_item(self, item, spider):
        """Insert every non-duplicate record packaged from *item*.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        # insertDataList maps unique_code -> row dict.
        for uniqueCode in insertDataList:
            self.db.insert(self.tableName, insertDataList[uniqueCode])

        return True

    def filterAndPackageDgrate(self):
        """Package the item's parallel lists into rows keyed by unique code.

        Entries without a description are dropped. When Redis
        de-duplication is enabled, rows whose code is already known are
        removed before returning.
        """
        uniqueCodeList = []
        insertData = {}
        item = self.item

        rule_id = item['rule_id']
        # One timestamp for both fields so they can never straddle a second
        # boundary (the original called time.time() twice).
        now = int(time.time())
        public_time = now
        create_time = now

        for index, title in enumerate(item['title']):

            uniqueCode = toMd5(item['source_url'][index])
            # img_url list may be shorter than the title list; default to ''.
            if index < len(item['img_url']) and item['img_url'][index]:
                img_url = json.dumps(item['img_url'][index])
            else:
                img_url = ''

            # Skip entries that carry no description.
            if index < len(item['description']) and item['description'][index]:
                description = item['description'][index]
            else:
                continue

            # Truncate to 255 characters (not bytes) before storing.
            title = title.decode('utf8')[0:255].encode('utf8')
            uniqueCodeList.append(uniqueCode)
            insertData[uniqueCode] = {
                'source_url': item['source_url'][index],
                'unique_code': uniqueCode,
                'rule_id': rule_id,
                'title': title,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time
            }

        if uniqueCodeList and OPEN_REDIS_DISTINCT:
            # Drop rows Redis reports as already seen; the original used
            # enumerate() here but never read the index.
            repeatUniqueCode = requstDistinct(uniqueCodeList)
            for unique in repeatUniqueCode:
                del insertData[unique]

        return insertData
Exemplo n.º 14
0
class XmlFeedPipeline(object):
    """Persist XML feed items into MySQL, de-duplicating through Redis."""

    def __init__(self):
        """Create the database handle and remember the target table."""
        settings = {'host': db_host, 'user': db_user, 'passwd': db_password}
        self.db = Mysql(settings, db_name)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Package, de-duplicate and insert the records carried by *item*."""
        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        packaged = self.filterAndPackageDgrate()
        for code in packaged:
            self.db.insert(self.tableName, packaged[code])

        return True

    def filterAndPackageDgrate(self):
        """Turn the item's parallel lists into rows keyed by unique code."""
        codes = []
        rows = {}
        item = self.item

        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        for position, raw_title in enumerate(item['title']):
            code = toMd5(item['source_url'][position])

            img_list = item['img_url']
            if position < len(img_list) and img_list[position]:
                img_url = json.dumps(img_list[position])
            else:
                img_url = ''

            desc_list = item['description']
            if not (position < len(desc_list) and desc_list[position]):
                continue
            description = desc_list[position]

            trimmed_title = raw_title.decode('utf8')[0:255].encode('utf8')
            codes.append(code)
            rows[code] = {
                'source_url': item['source_url'][position],
                'unique_code': code,
                'rule_id': rule_id,
                'title': trimmed_title,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time
            }

        if codes and OPEN_REDIS_DISTINCT:
            for duplicate in requstDistinct(codes):
                del rows[duplicate]

        return rows