def __init__(self):
    # host is the Redis host; both the Redis server and client must be running.
    self.pool = redis.ConnectionPool(
        host=mod_config.get_config("redis", "redis_host"),
        port=int(mod_config.get_config("redis", "redis_port")),  # config values are strings
        decode_responses=True)
    self.r = redis.Redis(connection_pool=self.pool)
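# Hedged usage sketch: the enclosing class is not shown in this snippet, so
# `RedisController` is an assumed name. With decode_responses=True the pooled
# client returns str values rather than bytes.
def _redis_usage_example():
    controller = RedisController()  # assumed class name
    controller.r.set('toutiao:last_timestamp', '1489752305')
    return controller.r.get('toutiao:last_timestamp')  # -> '1489752305'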
def __init__(self):
    # Read Aliyun OSS credentials and endpoint from config, then build an
    # authenticated bucket handle.
    self.bucket_name = mod_config.get_config('aliyun_oss', 'bucket_name')
    self.access_key = mod_config.get_config('aliyun_oss', 'access_key')
    self.access_secret_key = mod_config.get_config('aliyun_oss', 'access_secret_key')
    self.http_prefix = mod_config.get_config('aliyun_oss', 'http_prefix')
    self.auth = oss2.Auth(self.access_key, self.access_secret_key)
    self.bucket = oss2.Bucket(self.auth, self.http_prefix, self.bucket_name)
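# Hedged usage sketch: `OSSController` is an assumed name for the class whose
# __init__ appears above. Bucket.put_object is part of the oss2 SDK; the
# object key and payload here are illustrative only.
def _oss_usage_example():
    oss = OSSController()  # assumed class name
    result = oss.bucket.put_object('news/demo.json', '{"hello": "toutiao"}')
    return result.status  # 200 on success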
def __init__(self, table_name):
    """Connect via MongoClient.

    Three connection styles are available; choose according to usage.
    """
    self.client = MongoClient(
        mod_config.get_config("database", "dbhost"),
        int(mod_config.get_config("database", "dbport")))
    self.db = self.client[mod_config.get_config("database", "dbname")]
    self.table = self.db[table_name]
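# Hedged usage sketch: `MongoDBController` is the assumed enclosing class.
# self.table is a plain pymongo Collection, so the standard insert/find API
# applies; the collection name and document are illustrative.
def _mongo_usage_example():
    controller = MongoDBController('news')  # assumed class name
    controller.table.insert_one({'group_id': '123', 'title': 'demo'})
    return controller.table.find_one({'group_id': '123'})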
class JokeNewsEngine(Document):
    id_str = StringField(required=True, unique=True)
    favorite_count = IntField(required=True)   # number of favorites
    go_detail_count = IntField(required=True)  # number of detail-page views
    comment_count = IntField(required=True)    # number of comments
    share_count = IntField(required=True)      # number of shares
    bury_count = IntField(required=True)       # number of dislikes
    digg_count = IntField(required=True)       # number of likes
    online_time = IntField(required=True)      # publish timestamp
    ori_data = DictField(required=True)
    meta = {'collection': mod_config.get_config('mongodb', 'collection')}

    @classmethod
    def create_joke_obj(cls, ori_data):
        try:
            cls(id_str=ori_data['group']['id_str'],
                favorite_count=ori_data['group']['favorite_count'],
                comment_count=ori_data['group']['comment_count'],
                go_detail_count=ori_data['group']['go_detail_count'],
                share_count=ori_data['group']['share_count'],
                bury_count=ori_data['group']['bury_count'],
                digg_count=ori_data['group']['digg_count'],
                online_time=ori_data['online_time'],
                ori_data=ori_data).save()
        except Exception as e:
            print "joke news object save error: ", e
            mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
            mongoDBControllerErrorMessage.error('joke news object save error: ' + str(e))

    # Check whether this news item already exists in the database.
    @classmethod
    def check_joke_obj(cls, ori_data):
        return cls.objects(id_str=ori_data['group']['id_str']).first() is not None
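# Hedged usage sketch of the intended dedupe-then-save flow implied by the two
# classmethods above: check for an existing document by id_str before saving.
# `ori_data` mirrors the structure those methods expect.
def _joke_engine_usage_example(ori_data):
    if not JokeNewsEngine.check_joke_obj(ori_data):
        JokeNewsEngine.create_joke_obj(ori_data)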
class RecommendTypedNewsEngine(Document):
    group_id = StringField(required=True, unique=True)
    news_tag = StringField(unique=False)    # news category tag
    comment_count = IntField(unique=False)  # number of comments
    behot_time = IntField(unique=False)     # news timestamp
    ori_data = DictField(required=True)     # everything else
    meta = {'collection': mod_config.get_config('mongodb', 'collection')}

    @classmethod
    def create_regular_obj(cls, ori_data):
        try:
            cls(group_id=ori_data['group_id'],
                news_tag=ori_data['tag'],
                comment_count=ori_data['comments_count'],
                behot_time=ori_data['behot_time'],
                ori_data=ori_data).save()
        except Exception as e:
            print "recommend-typed news object save error: ", e
            mongoDBControllerErrorMessage = get_logger('mongoDBController.log')
            mongoDBControllerErrorMessage.error('recommend-typed news object save error: ' + str(e))

    @classmethod
    def check_regular_obj(cls, ori_data):
        return cls.objects(group_id=ori_data['group_id']).first() is not None
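# Both Document subclasses above need a registered mongoengine connection
# before .save() or .objects will work. A minimal sketch, assuming the same
# [mongodb] config keys that main() reads below:
import mongoengine

def _connect_mongoengine():
    mongoengine.connect(
        db=mod_config.get_config('mongodb', 'database_name'),
        host=mod_config.get_config('mongodb', 'host'),
        port=int(mod_config.get_config('mongodb', 'port')))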
def main():
    """Read the target news timestamp and news type, then start the crawler.

    Timestamp format: 2017-03-17 20:05:32
    News types: 'recommend', 'hot', 'image', 'joke', 'society',
                'entertainment', 'tech', 'sports', 'car', 'finance', 'funny'
    """
    # Target timestamp: '0' means only the newest news; a value such as
    # '2017-03-15 18:23:05' means record news from now back to that point.
    hot_timeStamp = mod_config.get_config('crawler', 'timeStamp')
    newsType = mod_config.get_config('crawler', 'newsType')  # target news type
    url = TouTiaoNewsURL.getTargetURL(newsType, '0')  # build the target URL
    print 'url get: ', url

    # Read the MongoDB host, port and database name, then connect.
    mongoDataBase = mod_config.get_config('mongodb', 'database_name')
    mongoHost = mod_config.get_config('mongodb', 'host')
    mongoPort = mod_config.get_config('mongodb', 'port')
    MongoDBController(mongoDataBase, mongoHost, int(mongoPort)).connectToMongoDB()

    # The crawler API is invoked here to control which type of news is crawled.
    crawler = CrawlerLoopControl()
    print 'crawler control constructed'
    if mod_config.get_config('crawler', 'crawlerMode') == 'timestamp':
        crawler.getTimeStampNewsLooply(hot_timeStamp, newsType, url)
    elif mod_config.get_config('crawler', 'crawlerMode') == 'newest':
        crawler.getRecentNewsLooply(newsType, url)
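# A hedged sketch of the config sections main() reads. The section and key
# names come from the get_config calls above; every value is an illustrative
# placeholder, not a real default:
#
# [crawler]
# timeStamp = 2017-03-15 18:23:05
# newsType = hot
# crawlerMode = timestamp
#
# [mongodb]
# database_name = toutiao
# host = 127.0.0.1
# port = 27017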
def start_api_tkdata():
    # Bind the WSGI app to the configured host and port and start serving.
    WSGIServer(myapp,
               bindAddress=(mod_config.get_config("server", "server_host"),
                            int(mod_config.get_config("server", "tk_data_port")))).run()
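# Hedged sketch: `myapp` above must be a standard WSGI callable defined
# elsewhere in the project. A minimal stand-in, for illustration only:
def myapp_example(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return ['tk_data api alive\n']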