Example No. 1
 def initialize_info(self, user_config):
     """初始化爬虫信息"""
     self.weibo = []
     self.user = UserInfo()
     self.user_config = user_config
     self.got_count = 0
     self.weibo_id_list = []
Example No. 2
def get_user_info():
  """Get UserInfo for currently logged in user.

  This will insert the new user if it does not already exist in datastore.

  Returns:
    UserInfo record for user if user is logged in, else None.
  """
  user = users.get_current_user()
  if user is None:
    return None
  auth_email = user.email()
  effective_email = auth_email

  if auth_email == '*****@*****.**':
    effective_email = '*****@*****.**'
  if auth_email == '*****@*****.**':
    effective_email = '*****@*****.**'

  if auth_email == effective_email:
    ui = UserInfo.get_or_insert(key_name='user:%s' % auth_email)
  else:
    ui = UserInfo.get_by_key_name('user:%s' % effective_email)
    if not ui:
      logging.error("User %s failed to act as %s; %s doesn't exist", auth_email, effective_email, effective_email)
      return None
    logging.info("User %s acting as %s", auth_email, effective_email)
    ui.non_owner = True
    ui.real_email = auth_email
  return ui
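
A hedged sketch of how a request handler might call get_user_info; the handler class and greeting are illustrative and not part of the source:

class MainHandler(webapp.RequestHandler):
  def get(self):
    ui = get_user_info()
    if ui is None:
      # Not signed in: send the visitor to the App Engine login page.
      self.redirect(users.create_login_url(self.request.uri))
      return
    # The key name is 'user:<email>', as built by get_user_info above.
    self.response.out.write('Hello, %s' % ui.key().name())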
Example No. 3
 def get_user_info(self):
     """获取用户信息"""
     params = {'containerid': '100505' + str(self.user_config['user_id'])}
     js = self.get_json(params)
     if js['ok']:
         info = js['data']['userInfo']
         user_info = OrderedDict()
         user_info['user_id'] = self.user_config['user_id']
         user_info['screen_name'] = info.get('screen_name', '')
         user_info['gender'] = "女" if js['data']['userInfo']['gender'] == "f" else "男"
         params = {
             'containerid':
                 '230283' + str(self.user_config['user_id']) + '_-_INFO'
         }
         zh_list = [
             u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司', u'注册时间',
             u'阳光信用'
         ]
         en_list = [
             'birthday', 'location', 'education', 'education', 'education',
             'education', 'company', 'registration_time', 'sunshine'
         ]
         for i in en_list:
             user_info[i] = ''
         js = self.get_json(params)
         if js['ok']:
             cards = js['data']['cards']
             if isinstance(cards, list) and len(cards) > 1:
                 card_list = cards[0]['card_group'] + cards[1]['card_group']
                 for card in card_list:
                     if card.get('item_name') in zh_list:
                         user_info[en_list[zh_list.index(
                             card.get('item_name'))]] = card.get(
                             'item_content', '')
         user_info['statuses_count'] = info.get('statuses_count', 0)
         user_info['followers_count'] = info.get('followers_count', 0)
         user_info['follow_count'] = info.get('follow_count', 0)
         user_info['description'] = info.get('description', '')
         user_info['profile_url'] = info.get('profile_url', '')
         user_info['profile_image_url'] = info.get('profile_image_url', '')
         user_info['avatar_hd'] = info.get('avatar_hd', '')
         user_info['urank'] = info.get('urank', 0)
         user_info['mbrank'] = info.get('mbrank', 0)
         user_info['verified'] = info.get('verified', False)
         user_info['verified_type'] = info.get('verified_type', -1)
         user_info['verified_reason'] = info.get('verified_reason', '')
         user = self.standardize_info(user_info)
         self.user = UserInfo(**dict(user))
         self.user_to_database()
         return user
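
Example No. 3 assumes a get_json helper on the same class; based on the fuller listing in Example No. 26, a minimal sketch of that helper could look like this (proxy and SSL options omitted):

 def get_json(self, params):
     """Fetch JSON from the m.weibo.cn container API (sketch)."""
     url = 'https://m.weibo.cn/api/container/getIndex?'
     r = requests.get(url, params=params, headers=self.headers)
     return r.json()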
Example No. 4
 def bassCreateUser(self, username, userId):
     user = UserInfo.UserInfo()
     user.name = username
     user.id = userId
     user = user.__dict__
     # print the dict
     print(user)
     # convert the dict to JSON
     userJson = json.dumps(user)
     userDao = UserDao.UserDao()
     userDict = userDao.baasCreateUser(userJson)
     userInfo = UserInfo.UserInfo()
     userInfo.userId = userDict["id"]
     userInfo.publicKey = userDict['basePublicKey']
     userInfo.privateKey = userDict['basePrivateKey']
     userInfo.address = userDict['baseAccountAddress']
     return userInfo
Example No. 5
def lookup_and_authenticate_user(handler, claimed_email, claimed_password):
    if not claimed_email:
        return None
    claimed_user = UserInfo.get_by_key_name("user:%s" % claimed_email)
    if not claimed_user:
        return None
    if claimed_email == "*****@*****.**" and handler.request.headers["Host"] == "localhost:8080":
        # No auth for testing.
        return claimed_user
    if claimed_user.upload_password and claimed_user.upload_password == claimed_password:
        return claimed_user
    return None
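
A hedged sketch of calling lookup_and_authenticate_user from an upload handler; the method and request parameter names mirror Examples No. 9 and 10 but are otherwise illustrative:

def post(self):
    user = lookup_and_authenticate_user(
        self, self.request.get('user_email'), self.request.get('password'))
    if user is None:
        self.error(403)
        return
    # ...proceed with the authenticated upload...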
Example No. 6
def get_user_info():
    """Get UserInfo for currently logged in user.

  This will insert the new user if it does not already exist in datastore.

  Returns:
    UserInfo record for user if user is logged in, else None.
  """
    user = users.get_current_user()
    if user is None:
        return None
    else:
        return UserInfo.get_or_insert(key_name='user:%s' % user.email())
Example No. 7
def get_user_info():
    """Get UserInfo for currently logged in user.

  This will insert the new user if it does not already exist in datastore.

  Returns:
    UserInfo record for user if user is logged in, else None.
  """
    user = users.get_current_user()
    if user is None:
        return None
    else:
        return UserInfo.get_or_insert(key_name="user:%s" % user.email())
Example No. 8
def lookup_and_authenticate_user(handler, claimed_email, claimed_password):
  if not claimed_email:
    return None
  claimed_user = UserInfo.get_by_key_name('user:%s' % claimed_email)
  if not claimed_user:
    return None
  if claimed_email == '*****@*****.**' and \
        handler.request.headers["Host"] == "localhost:8080":
    # No auth for testing.
    return claimed_user
  if claimed_user.upload_password and \
        claimed_user.upload_password == claimed_password:
    return claimed_user
  return None
Example No. 9
    def get(self):
        effective_user = None

        claimed_email = self.request.get('user_email')
        if claimed_email:
            claimed_user = UserInfo.get_by_key_name('user:%s' % claimed_email)
            if claimed_user and \
               claimed_user.upload_password and \
               claimed_user.upload_password == self.request.get('password'):
                effective_user = claimed_user

        if effective_user:
            self.response.headers['Content-Type'] = 'text/plain'
            upload_url = blobstore.create_upload_url('/admin/store')
            self.response.out.write(upload_url)
        else:
            self.error(403)
Example No. 10
  def get(self):
    effective_user = None

    claimed_email = self.request.get('user_email')
    if claimed_email:
      claimed_user = UserInfo.get_by_key_name('user:%s' % claimed_email)
      if claimed_user and \
         claimed_user.upload_password and \
         claimed_user.upload_password == self.request.get('password'):
        effective_user = claimed_user

    if effective_user:
      self.response.headers['Content-Type'] = 'text/plain'
      upload_url = blobstore.create_upload_url('/admin/store')
      self.response.out.write(upload_url)
    else:
      self.error(403)
Example No. 11
        def store_media():
            """Store media object info in datastore.

      Also updates the user-info record to keep count of media objects.

      This function is run as a transaction.
      """
            user_info = UserInfo.get_by_key_name("user:%s" % user_email)
            if user_info is None:
                error_messages.append("User record has been deleted.  " "Try uploading again")
                return

            media = MediaObject(
                parent=user_info,
                owner=user_info,
                blob=blob_info.key(),
                creation=blob_info.creation,
                content_type=blob_info.content_type,
                filename=blob_info.filename,
                size=int(blob_info.size),
                lacks_document=True,
            )

            user_info.media_objects += 1
            db.put(user_info)
            db.put(media)

            if bool(is_doc) and is_doc != "0":
                tag_list = []
                if tags is not None:
                    tag_list = [x for x in re.split(r"\s*,\s*", tags) if x]

                doc = Document(
                    parent=user_info,
                    owner=user_info,
                    pages=[media.key()],
                    title=title,
                    description=description,
                    no_tags=(len(tag_list) == 0),
                    tags=tag_list,
                )
                db.put(doc)
                media.document = doc.key()
                media.lacks_document = False
                db.put(media)
Example No. 12
  def get(self):
    self.response.headers['Cache-Control'] = "private"
    self.response.headers['Content-Type'] = "text/plain; charset=utf-8"

    user = UserInfo.get_by_key_name('user:[email protected]')

    docs = Document.all().filter('owner', user)
    docs = docs.fetch(10000)
    self.response.out.write("# got %d docs\n" % len(docs))
    for doc in docs:
      self.response.out.write("%s tags[%s] date[%s] title[%s] \n" % (doc.display_url, doc.tag_comma_separated, doc.date_yyyy_mm_dd, doc.title_or_empty_string))
      for page in doc.pages:
        self.response.out.write(" has_page: %d\n" % (page.id_or_name()))
    meds = MediaObject.all().filter('owner', user)
    meds = meds.fetch(10000)
    self.response.out.write("# got %d mediaobjects\n" % len(meds))
    for mo in meds:
      self.response.out.write("%s creation[%s] size[%d]\n" % (mo.url_path, str(mo.creation), mo.size))
Example No. 13
    def store_media():
      """Store media object info in datastore.

      Also updates the user-info record to keep count of media objects.

      This function is run as a transaction.
      """
      user_info = UserInfo.get_by_key_name('user:%s' % user_email)
      if user_info is None:
        error_messages.append('User record has been deleted.  '
                              'Try uploading again')
        return

      media = MediaObject(
          parent=user_info,
          owner=user_info,
          blob=blob_info.key(),
          creation=blob_info.creation,
          content_type=blob_info.content_type,
          filename=blob_info.filename,
          size=int(blob_info.size),
          lacks_document=True)

      user_info.media_objects += 1
      db.put(user_info)
      db.put(media)

      if bool(is_doc) and is_doc != "0":
        tag_list = []
        if tags is not None:
          tag_list = [x for x in re.split(r'\s*,\s*', tags) if x]

        doc = Document(
            parent=user_info,
            owner=user_info,
            pages=[media.key()],
            title=title,
            description=description,
            no_tags=(len(tag_list)==0),
            tags=tag_list)
        db.put(doc)
        media.document = doc.key()
        media.lacks_document = False
        db.put(media)
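
The docstring notes that store_media runs as a transaction; with the App Engine db API the enclosing handler would typically invoke it roughly like this (a hedged sketch, since the surrounding code is not shown):

    db.run_in_transaction(store_media)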
Example No. 14
 def findUserInfoByAddress(self, address):
     sql = """
                    select id,name,address,publicKey,privateKey from user a left join user_info b  on a.id = b.userId  where address = %s
                """
     array = []
     param = []
     param.append(address)
     try:
         baseDao = BaseDao.BaseDao()
         result = baseDao.execteGetOneSql(sql, param)
         user = UserInfo.UserInfo()
         user.id = result[0]
         user.name = result[1]
         user.address = result[2]
         user.privateKey = result[4]
         user.publicKey = result[3]
         return user
     except Exception as e:
         print(e)
Example No. 15
def user_info():
	"""
		Save the user's profile information to the database.
		This is the user sign-up page; the user must log in successfully before reaching it.
	"""

	if not is_loggedin():
		return redirect(url_for('login'))
	form=UserInfoForm()
	if request.method=='POST':
		userInfo=UserInfo(form.nickName.data, form.email.data, form.phone.data, form.city.data, 
			form.state.data, form.zipcode.data, form.education.data, form.sports.data,form.arts.data,
			form.travel.data,form.music.data,form.reading.data,form.gardening.data, form.nature.data,
			form.snowboard.data,form.food.data)
		db.session.add(userInfo) #add returned data to user table in the database 
		db.session.commit()
		return redirect(url_for('home'))

	elif request.method=='GET':
		return render_template('user_info.html',form=form)
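
The view above presumably sits behind a Flask route that accepts both GET and POST; a hedged sketch of that wiring (the URL rule and app object are assumptions):

app.add_url_rule('/user_info', 'user_info', user_info, methods=['GET', 'POST'])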
Example No. 16
    def getBossAccount(self, customerName):
        sql = """
                       select id,name,address,publicKey,privateKey from user a left join user_info b  on a.id = b.userId  where customerName = %s
                        and b.isBoss=0
                   """
        array = []
        param = []
        param.append(customerName)
        try:
            baseDao = BaseDao.BaseDao()
            result = baseDao.execteGetOneSql(sql, param)
            user = UserInfo.UserInfo()
            user.id = result[0]
            user.name = result[1]
            user.address = result[2]
            user.privateKey = result[4]
            user.publicKey = result[3]

            return user
        except Exception as e:
            print(e)
Example No. 17
async def join(users: UserI):
    userlist=list(users)
    user_id = userlist[1][1]
    user_pw = userlist[2][1]
    user_name = userlist[3][1]
    user_age = userlist[4][1]
    

    user = UserInfo()
    user.user_id = user_id
    user.user_pw = user_pw
    user.user_name = user_name
    user.user_age = user_age

    session.add(user)
    session.commit()
    return "회원가입완료!"
Example No. 18
    def findUserInfo(self, customerName):
        sql = """
               select id,name,address,publicKey,privateKey from user a left join user_info b  on a.id = b.userId  where customerName = %s
                and b.isBoss=1
           """
        array = []
        param = []
        param.append(customerName)
        try:
            baseDao = BaseDao.BaseDao()
            result = baseDao.executeGetAllSql(sql, param)

            for row in result:
                user = UserInfo.UserInfo()
                user.id = row[0]
                user.name = row[1]
                user.address = row[2]
                user.privateKey = row[4]
                user.publicKey = row[3]
                array.append(user)
            return array
        except Exception as e:
            print(e)
Example No. 19
def settleTest(address,amount):
    userController = NewUserController.NewUserController()
    userInfo = UserInfo.UserInfo()

    list=[]
    settleObject =NewSettleObject.NewSettleObject()
    settleObject.amount = amount
    settleObject.ownerAccount = address

    userInfo = userController.findUserInfoByAddress(settleObject.ownerAccount)
    list.append(userInfo)
    settleObject.userPrivateKey = userInfo.privateKey
    userAssetArray = userController.findAssetId(userInfo)
    userAssetStr = ""
    first= True
    for each in userAssetArray:
        if first:
            userAssetStr = userAssetStr+each.assetAddress
            first=False
        else:
            userAssetStr = userAssetStr+"," + each.assetAddress
    settleObject.srcAsset = userAssetStr
    userController.settle(settleObject)
Example No. 20
    def get(self):
        self.response.headers['Cache-Control'] = "private"
        self.response.headers['Content-Type'] = "text/plain; charset=utf-8"

        user = UserInfo.get_by_key_name('user:[email protected]')

        docs = Document.all().filter('owner', user)
        docs = docs.fetch(10000)
        self.response.out.write("# got %d docs\n" % len(docs))
        for doc in docs:
            self.response.out.write(
                "%s tags[%s] date[%s] title[%s] \n" %
                (doc.display_url, doc.tag_comma_separated, doc.date_yyyy_mm_dd,
                 doc.title_or_empty_string))
            for page in doc.pages:
                self.response.out.write(" has_page: %d\n" %
                                        (page.id_or_name()))
        meds = MediaObject.all().filter('owner', user)
        meds = meds.fetch(10000)
        self.response.out.write("# got %d mediaobjects\n" % len(meds))
        for mo in meds:
            self.response.out.write("%s creation[%s] size[%d]\n" %
                                    (mo.url_path, str(mo.creation), mo.size))
Example No. 21
    def get(self):
        self.response.headers['Content-Type'] = 'text/html'

        user = users.get_current_user()
        current_folder = self.request.get('current_folder')
        folder = None
        if user:
            user_key = ndb.Key('UserInfo', user.email())
            user_info = user_key.get()
            if user_info is None:
                user_info = UserInfo(id=user.email())
                user_info.email = user.email()
                folder_info = FolderInfo(id=user.email() + "/")
                folder_info.name = "/"
                user_info.folder = folder_info.key
                folder_info.put()
                user_info.put()
                time.sleep(1)

            if len(current_folder) == 0:
                folder = user_info.folder.get()
            else:
                folder_key = ndb.Key('FolderInfo',
                                     user.email() + current_folder)
                folder = folder_key.get()

            url = users.create_logout_url(self.request.uri)
            url_string = 'logout'
        else:
            url = users.create_login_url(self.request.uri)
            url_string = 'login'

        template_values = {
            'url': url,
            'url_string': url_string,
            'user': user,
            'upload_url': blobstore.create_upload_url('/upload'),
            'message': self.request.get('message'),
            'current_folder': folder
        }

        template = JINJA_ENVIRONMENT.get_template('main.html')
        self.response.write(template.render(template_values))
        MainPage.obj = self
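
Example No. 21 relies on UserInfo and FolderInfo ndb models keyed by e-mail address and folder path; a hedged sketch of minimal definitions consistent with the fields used above (property choices are assumptions):

from google.appengine.ext import ndb

class FolderInfo(ndb.Model):
    name = ndb.StringProperty()

class UserInfo(ndb.Model):
    email = ndb.StringProperty()
    folder = ndb.KeyProperty(kind='FolderInfo')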
Example No. 22
 def __init__(self, user_id_list, config=None):
     """Weibo类初始化"""
     if not config:
         config = weibo_config
     config['user_id_list'] = user_id_list
     self.validate_config(config)
     self.filter = config[
         'filter']  # 0 or 1; the default 0 crawls all of the user's weibos, 1 crawls only original weibos
     since_date = config['since_date']
     if isinstance(since_date, int):
         since_date = date.today() - timedelta(since_date)
     since_date = str(since_date)
     self.since_date = since_date  # start date: crawl weibos posted from this date to now, in yyyy-mm-dd form
     self.start_page = config.get('start_page',
                                  1)  # page to start crawling from; useful for resuming after being rate-limited
     self.write_mode = config[
         'write_mode']  # output targets, a list that may contain csv, mongo and mysql
     self.original_pic_download = config[
         'original_pic_download']  # 0 or 1; 0 means do not download images of original weibos, 1 means download
     self.retweet_pic_download = config[
         'retweet_pic_download']  # 0 or 1; 0 means do not download images of retweeted weibos, 1 means download
     self.original_video_download = config[
         'original_video_download']  # 0 or 1; 0 means do not download videos of original weibos, 1 means download
     self.retweet_video_download = config[
         'retweet_video_download']  # 0 or 1; 0 means do not download videos of retweeted weibos, 1 means download
     self.result_dir_name = config.get(
         'result_dir_name', 0)  # 0 or 1: store results in a folder named by user nickname or by user id
     cookie = config.get('cookie')  # Weibo cookie, optional
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
     self.headers = {'User_Agent': user_agent, 'Cookie': cookie}
     self.mysql_config = config.get('mysql_config')  # MySQL connection config, optional
     query_list = config.get('query_list') or []
     if isinstance(query_list, str):
         query_list = query_list.split(',')
     self.query_list = query_list
     if not isinstance(user_id_list, list):
         if not os.path.isabs(user_id_list):
             user_id_list = os.path.split(
                 os.path.realpath(__file__))[0] + os.sep + user_id_list
         self.user_config_file_path = user_id_list  # path to the user config file
         user_config_list = self.get_user_config_list(user_id_list)
     else:
         self.user_config_file_path = ''
         user_config_list = [{
             'user_id': user_id,
             'since_date': self.since_date,
             'query_list': query_list
         } for user_id in user_id_list]
     self.user_config_list = user_config_list  # list of user_config dicts for the users to crawl
     self.user_config = {}  # current user config, containing user id and since_date
     self.start_date = ''  # date when the user's first weibo was fetched
     self.query = ''
     self.user: UserInfo = UserInfo()  # stores the target user's profile info
     self.got_count = 0  # number of weibos crawled
     self.weibo = []  # all crawled weibo records
     self.weibo_id_list = []  # ids of all crawled weibos
     self.proxies = requests.get(
         "http://api.hailiangip.com:8422/api/getIp?type=1&num=1&pid=&unbindTime=600&cid=&orderId=O21042810412537647150&time=1619577728&sign=d79086f5b8ba9dbe1a17e5b710b77032&noDuplicate=1&dataType=0&lineSeparator=0&singleIp="
     ).json()['data'][0]
     logger.info({
         "http": f"http://{self.proxies['ip']}:{self.proxies['port']}",
         "https": f"https://{self.proxies['ip']}:{self.proxies['port']}",
     })
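
A hedged sketch of driving this constructor (the class name Weibo comes from Example No. 26; the user id is a placeholder):

# Crawl one user's profile; the id string below is a placeholder.
wb = Weibo(['1669879400'])
user = wb.get_user_info()
print(user)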
Example No. 23
 def get(self):
     
     #=======================================================================
     # ** Create dict, then compare to session. If different, rewrite session
     # and UserInfo. If not, return session! Saves time if the button was hit
     # without updating!
     #=======================================================================
     fs_id = self.request.get('fs_id')
     
     access_token = self.request.get('access_token')
     
     if self.request.get('reset'):
         #reset to defaults... (possibly store these in UserInfo)
         
         self.store_user(access_token, reset=True)
     
     
     logging.info('*** fs_id = {} ***'.format(fs_id))
     homes = self.request.get_all('homes')
     charities = self.request.get_all('charities')
     latlon = self.request.get('latlon')
     home_prefs = []
     char_prefs = []
     
     robot_pref = self.request.get('robot_pref')
     
     user = UserInfo().all().filter('fs_id =', fs_id).get()
     #logging.info('*** user = {} ***'.format(user))
     prefs = json.loads(user.prefs)
     prefs['latlon'] = latlon  # for debugging, shouldn't need to change this...
     
     if homes:
         for home in homes:
             h_tup = home.split('|||')
             home_prefs.append(h_tup)
         
         prefs['homes'] = home_prefs
     
     if charities:
         for charity in charities:
             c_tup = charity.split('|||')
             
             char_prefs.append(c_tup)
         prefs['charities'] = char_prefs
     
     if robot_pref:
         friends = self.request.get('friends')
         if robot_pref == 'robot':
             #here we add a function to friend the robot if not already friends!
             
             if friends == 'no':
                 
                 friend = utils.friend_the_robot(access_token, fs_id)
                 #returns True or False if the friending worked!
                 if friend:
                     prefs['friends_with_ond'] = 'yes'
             
             prefs['robot_posts'] = True
         
         else:
             prefs['robot_posts'] = False
     
     udic = self.session.get('user')
     udic['prefs'] = prefs
     
     #Store 3-ways!
     self.session['user'] = udic
     memcache.set('user_' + fs_id, udic)
     
     pref_dump = json.dumps(prefs)
     
     user.prefs = pref_dump
     user.put()
     
     self.write(pref_dump)
Example No. 24
 def store_user(self, access_token, reset=False): #we might need something that 
     #=======================================================================
     # Function for storing user on first oauth. Oauth will check to see if 
     # user is in db, and if not, store them. This should happen ONLY once
     # so as not to overwrite set prefs! We'll use Memcache AND Session to 
     # Store! Update will update access token in all params! (??)
     #=======================================================================
     curr_client = utils.makeFoursquareClient(access_token)
     current_user = curr_client.users()
             
     if current_user:
         # Not an existing user so get user info
         # Store fs_id, token and prefs!
         profile = current_user['user']
         fs_id = profile["id"]
         
         existing_user_info = UserInfo.get_by_fs_id(fs_id)
         if existing_user_info and not reset:
             # User in DB we'll just update his/her access token!
             logging.info('*** There was an existing user with fs_id = {} ***'.format(fs_id))
             user = existing_user_info
             user.token = access_token
             prefs = json.loads(user.prefs)
             
             if not prefs.get('name'):
                 prefs['name'] = profile['firstName']
                 prefs['gender'] = profile['gender']
                 if not prefs.get('latlon'):
                     prefs['latlon'] = utils.get_latlon(current_user)
                 user.prefs = json.dumps(prefs)
         
         elif existing_user_info and reset:
             #user in db, but we want to reset to default prefs
             user = existing_user_info
             prefs = utils.make_default_prefs(curr_client,current_user)
             user.transactions = json.dumps([])
             user.prefs = json.dumps(prefs)
             user.token = access_token
                   
             
         else:
             logging.info('*** Creating a new user for fs_id = {} ***'.format(fs_id))
             user = UserInfo(fs_id = fs_id,
                             token = access_token)
             # store default prefs in user that can be reset later!
             prefs = utils.make_default_prefs(curr_client, current_user)
             
             
             user.transactions = json.dumps([])
             user.prefs = json.dumps(prefs)
             
         user.put() #make new user or update token of existing user!
         logging.info('****Added or updated User {} to DataBase!****'.format(user.fs_id))
         
         # Now store in Memcache and Session for later retrieval!
         udic = dict(fs_id = user.fs_id,
                     access_token = user.token,
                     gender=profile['gender'],
                     prefs = prefs)
         
         self.session["user"] = udic
         memcache.set('user_' + user.fs_id, udic)
         
     
     return self.session.get("user")
Example No. 25
    def get(self):
        # # to be rendered from UserInfo
        # # poss add param of rank to tuples? To sort by most used?
        sets = ''
#         homenow = ''
#         setprefs = ''
        check_session = ''
        check_store_user = ''
        reset_user = ''
        logout_user = ''
        trivtest = ' '
        transtest = ''
        
        #this tests set-info page!
        if sets:
            user = self.session.get('user')
            content = {'its_a_bar' : True}
            prefs = user['prefs']
            content.update(prefs)
            self.render('set-info.html', **content)
        
        #Test trivia game page!
        elif trivtest:
            content = DEF_CONTENT
            
            #this happens when user checks in to home!
            transaction = utils.create_transaction(content)
            self.update_transaction(transaction, activate= True)
            self.store_curr_transaction(transaction, db= True)
            
            
            ## *** Will need to do this on before home-now post! ***
            content['trivia_url'] = transaction['trivia_url']
            content['trans_id'] = transaction['trans_id']
            
            self.render('home-now.html', **content)
        
        #Test transaction functions on real database!
        elif transtest:
            content = DEF_CONTENT
            user = self.fetchUserInfo(content['fs_id'])
            transaction = json.loads(user.transaction)
        
        #Logs out user so they can go through process again!
        elif logout_user:
            self.logout()
            self.write('You are logged out!<br><br><br>')
            self.write('Why not go <a href="/">HERE</a> now?')
        
        #Tests if Store User worked!
        elif check_store_user:
            access_token = '0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX'
            udic = self.store_user(access_token)
            self.write('udic = <br><br>')
            self.write(udic)
            self.write('<br><br><br>')
            self.write('session_user = <br><br>')
            self.write(self.session.get('user'))
            self.write('<br><br><br>')
            self.write('memcached_user = <br><br>')
            self.write(memcache.get('user_' + udic['fs_id']))
            
        #resets a user's prefs
        elif reset_user:
            access_token = '0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX'
            udic = self.store_user(access_token, reset=True)
            self.write('user reset!<br>')
            self.write('udic = <br><br>')
            self.write(udic)
            self.write('<br><br><br>')
            self.write('session_user = <br><br>')
            self.write(self.session.get('user'))
            self.write('<br><br><br>')
            self.write('memcached_user = <br><br>')
            self.write(memcache.get('user_' + udic['fs_id']))
        
        #compares session to user!
        elif check_session:
            #see what's in my session cookie!
            user = self.current_user
            
            user_info = UserInfo.all().filter('fs_id = ','4091108').get()
                        
            self.write('user = <br><br>')
            self.write(user)
            self.write('<br><br><br>')
            self.write('user_info.prefs = <br><br>')
            self.write(user_info.prefs)
           
#         elif homenow:
#             content = {"human_time": "4:55:50 PM",
#                        "human_wager": "$420",
#                        "charity_id": "23-90876",
#                        "pronoun": "he",
#                        "then": "1366836950018",
#                        "home_id": "4d60a5e4865a224bdd32ae85",
#                        "charity": "The Creation Museum",
#                        "its_a_bar": True,
#                        "made_it": "y",
#                        "home": "Waterphone Of Dreams (S&T's)",
#                        "name": "Scott",
#                        "now": "1366661436381",
#                        "wager": "420"}
#             
#             # right_now = time.time()
#             
#             
#             self.render('home-now.html', **content)
#         elif setprefs:
#             user = UserInfo()
#             user.fs_id = '4091108'
#             user.prefs = json.dumps({'homes' : [] , 'charities' : []})
#             user.token = "0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX"
#             user.put()
#             
#             
#             self.write(user.fs_id)
        else:
            #self.logout() ##uncomment to debug! (ie set new prefs!)
            user = self.current_user
            
            #===================================================================
            # can put in an authed param to speed up the render!
            # 
            #===================================================================
            
            if not user:
                #udic = me! (for testing) Will update handling to create say['authed = false']
                udic = dict(name = 'Scott',
                            fs_id = '4091108',
                            access_token = "0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX",
                            gender = 'male',
                            prefs = PREFS)
                self.session["user"] = udic
                logging.info("*** Set a User! ***")
                user = udic
            client_id = CONFIG['client_id']
            params = {'client_id': client_id}
            params['auth_url'] = utils.generateFoursquareAuthUri(client_id)
            params['site_name'] = CONFIG['site_name']
            params['description'] = CONFIG['site_description']
            params['fs_id'] = user['fs_id']
            params.update(user['prefs'])
            params['bad_charities'] = BAD_CHARITIES
            
            #This will be added and set to false if not self.current_user!
            params['authed'] = 'true'
            
            self.render('index.html', **params)
Example No. 26
class Weibo(object):
    def __init__(self, user_id_list, config=None):
        """Weibo类初始化"""
        if not config:
            config = weibo_config
        config['user_id_list'] = user_id_list
        self.validate_config(config)
        self.filter = config[
            'filter']  # 0 or 1; the default 0 crawls all of the user's weibos, 1 crawls only original weibos
        since_date = config['since_date']
        if isinstance(since_date, int):
            since_date = date.today() - timedelta(since_date)
        since_date = str(since_date)
        self.since_date = since_date  # start date: crawl weibos posted from this date to now, in yyyy-mm-dd form
        self.start_page = config.get('start_page',
                                     1)  # page to start crawling from; useful for resuming after being rate-limited
        self.write_mode = config[
            'write_mode']  # output targets, a list that may contain csv, mongo and mysql
        self.original_pic_download = config[
            'original_pic_download']  # 0 or 1; 0 means do not download images of original weibos, 1 means download
        self.retweet_pic_download = config[
            'retweet_pic_download']  # 0 or 1; 0 means do not download images of retweeted weibos, 1 means download
        self.original_video_download = config[
            'original_video_download']  # 0 or 1; 0 means do not download videos of original weibos, 1 means download
        self.retweet_video_download = config[
            'retweet_video_download']  # 0 or 1; 0 means do not download videos of retweeted weibos, 1 means download
        self.result_dir_name = config.get(
            'result_dir_name', 0)  # 0 or 1: store results in a folder named by user nickname or by user id
        cookie = config.get('cookie')  # Weibo cookie, optional
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        self.headers = {'User_Agent': user_agent, 'Cookie': cookie}
        self.mysql_config = config.get('mysql_config')  # MySQL connection config, optional
        query_list = config.get('query_list') or []
        if isinstance(query_list, str):
            query_list = query_list.split(',')
        self.query_list = query_list
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.user_config_file_path = user_id_list  # path to the user config file
            user_config_list = self.get_user_config_list(user_id_list)
        else:
            self.user_config_file_path = ''
            user_config_list = [{
                'user_id': user_id,
                'since_date': self.since_date,
                'query_list': query_list
            } for user_id in user_id_list]
        self.user_config_list = user_config_list  # list of user_config dicts for the users to crawl
        self.user_config = {}  # current user config, containing user id and since_date
        self.start_date = ''  # date when the user's first weibo was fetched
        self.query = ''
        self.user: UserInfo = UserInfo()  # stores the target user's profile info
        self.got_count = 0  # number of weibos crawled
        self.weibo = []  # all crawled weibo records
        self.weibo_id_list = []  # ids of all crawled weibos
        self.proxies = requests.get(
            "http://api.hailiangip.com:8422/api/getIp?type=1&num=1&pid=&unbindTime=600&cid=&orderId=O21042810412537647150&time=1619577728&sign=d79086f5b8ba9dbe1a17e5b710b77032&noDuplicate=1&dataType=0&lineSeparator=0&singleIp="
        ).json()['data'][0]
        logger.info({
            "http": f"http://{self.proxies['ip']}:{self.proxies['port']}",
            "https": f"https://{self.proxies['ip']}:{self.proxies['port']}",
        })

    def validate_config(self, config):
        """验证配置是否正确"""
        # 验证filter、original_pic_download、retweet_pic_download、original_video_download、retweet_video_download
        argument_list = [
            'filter', 'original_pic_download', 'retweet_pic_download',
            'original_video_download', 'retweet_video_download'
        ]
        for argument in argument_list:
            if config[argument] != 0 and config[argument] != 1:
                logger.warning(u'%s值应为0或1,请重新输入', config[argument])
                sys.exit()

        # Validate since_date
        since_date = config['since_date']
        if (not self.is_date(str(since_date))) and (not isinstance(
                since_date, int)):
            logger.warning(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入')
            sys.exit()

        # Validate query_list
        query_list = config.get('query_list') or []
        if (not isinstance(query_list, list)) and (not isinstance(
                query_list, str)):
            logger.warning(u'query_list值应为list类型或字符串,请重新输入')
            sys.exit()

        # Validate write_mode
        write_mode = ['csv', 'json', 'mongo', 'mysql']
        if not isinstance(config['write_mode'], list):
            sys.exit(u'write_mode值应为list类型')
        for mode in config['write_mode']:
            if mode not in write_mode:
                logger.warning(
                    u'%s为无效模式,请从csv、json、mongo和mysql中挑选一个或多个作为write_mode',
                    mode)
                sys.exit()

        # Validate user_id_list
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list,
                           list)) and (not user_id_list.endswith('.txt')):
            logger.warning(u'user_id_list值应为list类型或txt文件路径')
            sys.exit()
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                logger.warning(u'不存在%s文件', user_id_list)
                sys.exit()

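    # A hedged sketch of a config dict that would pass validate_config above;
    # the field names come from the checks in this class, the values are illustrative:
    #
    #   config = {
    #       'user_id_list': ['<user_id>'],
    #       'filter': 0,
    #       'since_date': '2020-01-01',
    #       'write_mode': ['csv'],
    #       'original_pic_download': 0,
    #       'retweet_pic_download': 0,
    #       'original_video_download': 0,
    #       'retweet_video_download': 0,
    #   }
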
    def is_date(self, since_date):
        """判断日期格式是否正确"""
        try:
            datetime.strptime(since_date, '%Y-%m-%d')
            return True
        except ValueError:
            return False

    def get_json(self, params):
        """获取网页中json数据"""
        url = 'https://m.weibo.cn/api/container/getIndex?'
        r = requests.get(url,
                         params=params,
                         headers=self.headers,
                         proxies={
                             "http": f"http://{self.proxies['ip']}:{self.proxies['port']}",
                             "https": f"https://{self.proxies['ip']}:{self.proxies['port']}",
                         },
                         verify=False)
        return r.json()

    def get_weibo_json(self, page):
        """获取网页中微博json数据"""
        params = {
            'container_ext': 'profile_uid:' + str(self.user_config['user_id']),
            'containerid': '100103type=401&q=' + self.query,
            'page_type': 'searchall'
        } if self.query else {
            'containerid': '107603' + str(self.user_config['user_id'])
        }
        params['page'] = page
        js = self.get_json(params)
        return js

    def user_to_database(self):
        """将用户信息写入文件/数据库"""
        hbase.check_create_table("user", {"info": {}})
        hbase.update("user", self.user.user_id, {"info": self.user.dict()})

    def get_user_info(self):
        """获取用户信息"""
        params = {'containerid': '100505' + str(self.user_config['user_id'])}
        js = self.get_json(params)
        if js['ok']:
            info = js['data']['userInfo']
            user_info = OrderedDict()
            user_info['user_id'] = self.user_config['user_id']
            user_info['screen_name'] = info.get('screen_name', '')
            user_info['gender'] = "女" if js['data']['userInfo']['gender'] == "f" else "男"
            params = {
                'containerid':
                    '230283' + str(self.user_config['user_id']) + '_-_INFO'
            }
            zh_list = [
                u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司', u'注册时间',
                u'阳光信用'
            ]
            en_list = [
                'birthday', 'location', 'education', 'education', 'education',
                'education', 'company', 'registration_time', 'sunshine'
            ]
            for i in en_list:
                user_info[i] = ''
            js = self.get_json(params)
            if js['ok']:
                cards = js['data']['cards']
                if isinstance(cards, list) and len(cards) > 1:
                    card_list = cards[0]['card_group'] + cards[1]['card_group']
                    for card in card_list:
                        if card.get('item_name') in zh_list:
                            user_info[en_list[zh_list.index(
                                card.get('item_name'))]] = card.get(
                                'item_content', '')
            user_info['statuses_count'] = info.get('statuses_count', 0)
            user_info['followers_count'] = info.get('followers_count', 0)
            user_info['follow_count'] = info.get('follow_count', 0)
            user_info['description'] = info.get('description', '')
            user_info['profile_url'] = info.get('profile_url', '')
            user_info['profile_image_url'] = info.get('profile_image_url', '')
            user_info['avatar_hd'] = info.get('avatar_hd', '')
            user_info['urank'] = info.get('urank', 0)
            user_info['mbrank'] = info.get('mbrank', 0)
            user_info['verified'] = info.get('verified', False)
            user_info['verified_type'] = info.get('verified_type', -1)
            user_info['verified_reason'] = info.get('verified_reason', '')
            user = self.standardize_info(user_info)
            self.user = UserInfo(**dict(user))
            self.user_to_database()
            return user

    def get_long_weibo(self, id):
        """获取长微博"""
        for i in range(5):
            url = 'https://m.weibo.cn/detail/%s' % id
            html = requests.get(url, headers=self.headers, verify=False).text
            html = html[html.find('"status":'):]
            html = html[:html.rfind('"hotScheme"')]
            html = html[:html.rfind(',')]
            html = '{' + html + '}'
            js = json.loads(html, strict=False)
            weibo_info = js.get('status')
            if weibo_info:
                weibo = self.parse_weibo(weibo_info)
                return weibo
            sleep(random.randint(6, 10))

    def get_pics(self, weibo_info):
        """获取微博原始图片url"""
        if weibo_info.get('pics'):
            pic_info = weibo_info['pics']
            pic_list = [pic['large']['url'] for pic in pic_info]
            pics = ','.join(pic_list)
        else:
            pics = ''
        return pics

    def get_live_photo(self, weibo_info):
        """获取live photo中的视频url"""
        live_photo_list = []
        live_photo = weibo_info.get('pic_video')
        if live_photo:
            prefix = 'https://video.weibo.com/media/play?livephoto=//us.sinaimg.cn/'
            for i in live_photo.split(','):
                if len(i.split(':')) == 2:
                    url = prefix + i.split(':')[1] + '.mov'
                    live_photo_list.append(url)
            return live_photo_list

    def get_video_url(self, weibo_info):
        """获取微博视频url"""
        video_url = ''
        video_url_list = []
        if weibo_info.get('page_info'):
            if ((weibo_info['page_info'].get('urls')
                 or weibo_info['page_info'].get('media_info'))
                    and weibo_info['page_info'].get('type') == 'video'):
                media_info = weibo_info['page_info']['urls']
                if not media_info:
                    media_info = weibo_info['page_info']['media_info']
                video_url = media_info.get('mp4_720p_mp4')
                if not video_url:
                    video_url = media_info.get('mp4_hd_url')
                if not video_url:
                    video_url = media_info.get('hevc_mp4_hd')
                if not video_url:
                    video_url = media_info.get('mp4_sd_url')
                if not video_url:
                    video_url = media_info.get('mp4_ld_mp4')
                if not video_url:
                    video_url = media_info.get('stream_url_hd')
                if not video_url:
                    video_url = media_info.get('stream_url')
        if video_url:
            video_url_list.append(video_url)
        live_photo_list = self.get_live_photo(weibo_info)
        if live_photo_list:
            video_url_list += live_photo_list
        return ';'.join(video_url_list)

    def download_one_file(self, url, file_name, type, weibo_id):
        """下载单个文件(图片/视频)"""
        hbase.update("weibo", str(weibo_id), {"img" if type == "img" else "video": {file_name: url}})

    def handle_download(self, file_type, file_dir, urls, w):
        """处理下载相关操作"""
        file_prefix = w['created_at'][:11].replace('-', '') + '_' + str(
            w['id'])
        if file_type == 'img':
            if ',' in urls:
                url_list = urls.split(',')
                for i, url in enumerate(url_list):
                    index = url.rfind('.')
                    if len(url) - index >= 5:
                        file_suffix = '.jpg'
                    else:
                        file_suffix = url[index:]
                    file_name = file_prefix + '_' + str(i + 1) + file_suffix
                    self.download_one_file(url, file_name, file_type, w['id'])
            else:
                index = urls.rfind('.')
                if len(urls) - index > 5:
                    file_suffix = '.jpg'
                else:
                    file_suffix = urls[index:]
                file_name = file_prefix + file_suffix
                self.download_one_file(urls, file_name, file_type, w['id'])
        else:
            file_suffix = '.mp4'
            if ';' in urls:
                url_list = urls.split(';')
                if url_list[0].endswith('.mov'):
                    file_suffix = '.mov'
                for i, url in enumerate(url_list):
                    file_name = file_prefix + '_' + str(i + 1) + file_suffix
                    self.download_one_file(url, file_name, file_type, w['id'])
            else:
                if urls.endswith('.mov'):
                    file_suffix = '.mov'
                file_name = file_prefix + file_suffix
                self.download_one_file(urls, file_name, file_type, w['id'])

    def download_files(self, file_type, weibo_type, wrote_count):
        """下载文件(图片/视频)"""
        try:
            describe = ''
            if file_type == 'img':
                describe = u'图片'
                key = 'pics'
            else:
                describe = u'视频'
                key = 'video_url'
            if weibo_type == 'original':
                describe = u'原创微博' + describe
            else:
                describe = u'转发微博' + describe
            logger.info(u'即将进行%s下载', describe)
            file_dir = self.get_filepath(file_type)
            file_dir = file_dir + os.sep + describe
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            for w in self.weibo[wrote_count:]:
                if weibo_type == 'retweet':
                    if w.get('retweet'):
                        w = w['retweet']
                    else:
                        continue
                if w.get(key):
                    self.handle_download(file_type, file_dir, w.get(key), w)
            logger.info(u'%s下载完毕,保存路径:', describe)
            logger.info(file_dir)
        except Exception as e:
            logger.exception(e)

    def get_location(self, selector):
        """获取微博发布位置"""
        location_icon = 'timeline_card_small_location_default.png'
        span_list = selector.xpath('//span')
        location = ''
        for i, span in enumerate(span_list):
            if span.xpath('img/@src'):
                if location_icon in span.xpath('img/@src')[0]:
                    location = span_list[i + 1].xpath('string(.)')
                    break
        return location

    def get_article_url(self, selector):
        """获取微博中头条文章的url"""
        article_url = ''
        text = selector.xpath('string(.)')
        if text.startswith(u'发布了头条文章'):
            url = selector.xpath('//a/@data-url')
            if url and url[0].startswith('http://t.cn'):
                article_url = url[0]
        return article_url

    def get_topics(self, selector):
        """获取参与的微博话题"""
        span_list = selector.xpath("//span[@class='surl-text']")
        topics = ''
        topic_list = []
        for span in span_list:
            text = span.xpath('string(.)')
            if len(text) > 2 and text[0] == '#' and text[-1] == '#':
                topic_list.append(text[1:-1])
        if topic_list:
            topics = ','.join(topic_list)
        return topics

    def get_at_users(self, selector):
        """获取@用户"""
        a_list = selector.xpath('//a')
        at_users = ''
        at_list = []
        for a in a_list:
            if '@' + a.xpath('@href')[0][3:] == a.xpath('string(.)'):
                at_list.append(a.xpath('string(.)')[1:])
        if at_list:
            at_users = ','.join(at_list)
        return at_users

    def string_to_int(self, string):
        """字符串转换为整数"""
        if isinstance(string, int):
            return string
        elif string.endswith(u'万+'):
            string = int(string[:-2] + '0000')
        elif string.endswith(u'万'):
            string = int(string[:-1] + '0000')
        return int(string)

    def standardize_date(self, created_at):
        """标准化微博发布时间"""
        if u'刚刚' in created_at:
            created_at = datetime.now().strftime('%Y-%m-%d')
        elif u'分钟' in created_at:
            minute = created_at[:created_at.find(u'分钟')]
            minute = timedelta(minutes=int(minute))
            created_at = (datetime.now() - minute).strftime('%Y-%m-%d')
        elif u'小时' in created_at:
            hour = created_at[:created_at.find(u'小时')]
            hour = timedelta(hours=int(hour))
            created_at = (datetime.now() - hour).strftime('%Y-%m-%d')
        elif u'昨天' in created_at:
            day = timedelta(days=1)
            created_at = (datetime.now() - day).strftime('%Y-%m-%d')
        else:
            created_at = created_at.replace('+0800 ', '')
            temp = datetime.strptime(created_at, '%c')
            created_at = datetime.strftime(temp, '%Y-%m-%d')
        return created_at

    def standardize_info(self, weibo):
        """标准化信息,去除乱码"""
        # for k, v in weibo.items():
            # if 'bool' not in str(type(v)) and 'int' not in str(
            #         type(v)) and 'list' not in str(
            #     type(v)) and 'long' not in str(type(v)):
            #     weibo[k] = v.replace(u'\u200b', '').encode(
            #         sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)
        return weibo

    def parse_weibo(self, weibo_info):
        weibo = OrderedDict()
        if weibo_info['user']:
            weibo['user_id'] = weibo_info['user']['id']
            weibo['screen_name'] = weibo_info['user']['screen_name']
        else:
            weibo['user_id'] = ''
            weibo['screen_name'] = ''
        weibo['id'] = int(weibo_info['id'])
        weibo['bid'] = weibo_info['bid']
        text_body = weibo_info['text']
        selector = etree.HTML(text_body)
        weibo['text'] = etree.HTML(text_body).xpath('string(.)')
        weibo['article_url'] = self.get_article_url(selector)
        weibo['pics'] = self.get_pics(weibo_info)
        weibo['video_url'] = self.get_video_url(weibo_info)
        weibo['location'] = self.get_location(selector)
        weibo['created_at'] = weibo_info['created_at']
        weibo['source'] = weibo_info['source']
        weibo['attitudes_count'] = self.string_to_int(
            weibo_info.get('attitudes_count', 0))
        weibo['comments_count'] = self.string_to_int(
            weibo_info.get('comments_count', 0))
        weibo['reposts_count'] = self.string_to_int(
            weibo_info.get('reposts_count', 0))
        weibo['topics'] = self.get_topics(selector)
        weibo['at_users'] = self.get_at_users(selector)
        return self.standardize_info(weibo)

    def print_user_info(self):
        """打印用户信息"""
        logger.info('+' * 100)
        self.user.print()
        logger.info('+' * 100)

    def print_one_weibo(self, weibo):
        """打印一条微博"""
        try:
            logger.info(u'微博id:{}', weibo['id'])
            logger.info(u'微博正文:{}', weibo['text'])
            logger.info(u'原始图片url:{}', weibo['pics'])
            logger.info(u'微博位置:{}', weibo['location'])
            logger.info(u'发布时间:{}', weibo['created_at'])
            logger.info(u'发布工具:{}', weibo['source'])
            logger.info(u'点赞数:{}', weibo['attitudes_count'])
            logger.info(u'评论数:{}', weibo['comments_count'])
            logger.info(u'转发数:{}', weibo['reposts_count'])
            logger.info(u'话题:{}', weibo['topics'])
            logger.info(u'@用户:{}', weibo['at_users'])
            logger.info(u'url:https://m.weibo.cn/detail/{}', weibo['id'])
        except OSError:
            pass

    def print_weibo(self, weibo):
        """打印微博,若为转发微博,会同时打印原创和转发部分"""
        if weibo.get('retweet'):
            logger.info('*' * 100)
            logger.info(u'转发部分:')
            self.print_one_weibo(weibo['retweet'])
            logger.info('*' * 100)
            logger.info(u'原创部分:')
        self.print_one_weibo(weibo)
        logger.info('-' * 120)

    def get_one_weibo(self, info):
        """获取一条微博的全部信息"""
        try:
            weibo_info = info['mblog']
            weibo_id = weibo_info['id']
            retweeted_status = weibo_info.get('retweeted_status')
            is_long = True if weibo_info.get(
                'pic_num') > 9 else weibo_info.get('isLongText')
            if retweeted_status and retweeted_status.get('id'):  # retweet
                retweet_id = retweeted_status.get('id')
                is_long_retweet = retweeted_status.get('isLongText')
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
                if is_long_retweet:
                    retweet = self.get_long_weibo(retweet_id)
                    if not retweet:
                        retweet = self.parse_weibo(retweeted_status)
                else:
                    retweet = self.parse_weibo(retweeted_status)
                retweet['created_at'] = self.standardize_date(
                    retweeted_status['created_at'])
                weibo['retweet'] = retweet
            else:  # original
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
            weibo['created_at'] = self.standardize_date(
                weibo_info['created_at'])
            return weibo
        except Exception as e:
            logger.exception(e)

    def is_pinned_weibo(self, info):
        """判断微博是否为置顶微博"""
        weibo_info = info['mblog']
        title = weibo_info.get('title')
        if title and title.get('text') == u'置顶':
            return True
        else:
            return False

    def get_one_page(self, page):
        """获取一页的全部微博"""
        try:
            js = self.get_weibo_json(page)
            if js['ok']:
                weibos = js['data']['cards']
                if self.query:
                    weibos = weibos[0]['card_group']
                for w in weibos:
                    if w['card_type'] == 9:
                        wb = self.get_one_weibo(w)
                        if wb:
                            if wb['id'] in self.weibo_id_list:
                                continue
                            created_at = datetime.strptime(
                                wb['created_at'], '%Y-%m-%d')
                            since_date = datetime.strptime(
                                self.user_config['since_date'], '%Y-%m-%d')
                            if created_at < since_date:
                                if self.is_pinned_weibo(w):
                                    continue
                                else:
                                    logger.info(
                                        u'{}已获取{}({})的第{}页{}微博{}'.format(
                                            '-' * 30, self.user.screen_name,
                                            self.user.user_id, page,
                                            '包含"' + self.query +
                                            '"的' if self.query else '',
                                            '-' * 30))
                                    return True
                            if (not self.filter) or (
                                    'retweet' not in wb.keys()):
                                hbase.check_create_table("weibo", {"info": {}, "img": {}, "video": {}})
                                hbase.update("weibo", str(wb['id']), {"info": WeiBoInfo(**wb).dict(), "img": {}, "video": {}})
                                self.weibo.append(wb)
                                self.weibo_id_list.append(wb['id'])
                                self.got_count += 1
                                self.print_weibo(wb)
                            else:
                                logger.info(u'正在过滤转发微博')
            else:
                return True
            logger.info(u'{}已获取{}({})的第{}页微博{}'.format(
                '-' * 30, self.user.screen_name, self.user.user_id, page,
                '-' * 30))
        except Exception as e:
            logger.exception(e)

    def get_page_count(self):
        """获取微博页数"""
        try:
            page_count = int(math.ceil(int(self.user.statuses_count) / 10.0))
            return page_count
        except (KeyError, AttributeError, TypeError):
            logger.exception(
                u'程序出错,错误原因可能为以下两者:\n'
                u'1.user_id不正确;\n'
                u'2.此用户微博可能需要设置cookie才能爬取。\n'
                u'解决方案:\n'
                u'请参考\n'
                u'https://github.com/dataabc/weibo-crawler#如何获取user_id\n'
                u'获取正确的user_id;\n'
                u'或者参考\n'
                u'https://github.com/dataabc/weibo-crawler#3程序设置\n'
                u'中的“设置cookie”部分设置cookie信息')

    def get_write_info(self, wrote_count):
        """获取要写入的微博信息"""
        write_info = []
        for w in self.weibo[wrote_count:]:
            wb = OrderedDict()
            for k, v in w.items():
                if k not in ['user_id', 'screen_name', 'retweet']:
                    if 'unicode' in str(type(v)):
                        v = v.encode('utf-8')
                    wb[k] = v
            if not self.filter:
                if w.get('retweet'):
                    wb['is_original'] = False
                    for k2, v2 in w['retweet'].items():
                        if 'unicode' in str(type(v2)):
                            v2 = v2.encode('utf-8')
                        wb['retweet_' + k2] = v2
                else:
                    wb['is_original'] = True
            write_info.append(wb)
        return write_info

    def get_filepath(self, type):
        """获取结果文件路径"""
        try:
            dir_name = self.user.screen_name
            if self.result_dir_name:
                dir_name = self.user_config['user_id']
            file_dir = os.path.split(os.path.realpath(
                __file__))[0] + os.sep + 'weibo' + os.sep + dir_name
            if type == 'img' or type == 'video':
                file_dir = file_dir + os.sep + type
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            if type == 'img' or type == 'video':
                return file_dir
            file_path = file_dir + os.sep + self.user_config[
                'user_id'] + '.' + type
            return file_path
        except Exception as e:
            logger.exception(e)

    def get_result_headers(self):
        """获取要写入结果文件的表头"""
        result_headers = [
            'id', 'bid', '正文', '头条文章url', '原始图片url', '视频url', '位置', '日期', '工具',
            '点赞数', '评论数', '转发数', '话题', '@用户'
        ]
        if not self.filter:
            result_headers2 = ['是否原创', '源用户id', '源用户昵称']
            result_headers3 = ['源微博' + r for r in result_headers]
            result_headers = result_headers + result_headers2 + result_headers3
        return result_headers

    def write_csv(self, wrote_count):
        """将爬到的信息写入csv文件"""
        write_info = self.get_write_info(wrote_count)
        result_headers = self.get_result_headers()
        result_data = [w.values() for w in write_info]
        file_path = self.get_filepath('csv')
        self.csv_helper(result_headers, result_data, file_path)

    def csv_helper(self, headers, result_data, file_path):
        """将指定信息写入csv文件"""
        if not os.path.isfile(file_path):
            is_first_write = 1
        else:
            is_first_write = 0
        if sys.version < '3':  # python2.x
            with open(file_path, 'ab') as f:
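                # prepend a UTF-8 BOM so spreadsheet software detects the encoding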
                f.write(codecs.BOM_UTF8)
                writer = csv.writer(f)
                if is_first_write:
                    writer.writerows([headers])
                writer.writerows(result_data)
        else:  # python3.x
            with open(file_path, 'a', encoding='utf-8-sig', newline='') as f:
                writer = csv.writer(f)
                if is_first_write:
                    writer.writerows([headers])
                writer.writerows(result_data)
        if headers[0] == 'id':
            logger.info(u'%d条微博写入csv文件完毕,保存路径:', self.got_count)
        else:
            logger.info(u'%s 信息写入csv文件完毕,保存路径:', self.user.screen_name)
        logger.info(file_path)

    def update_json_data(self, data, weibo_info):
        """更新要写入json结果文件中的数据,已经存在于json中的信息更新为最新值,不存在的信息添加到data中"""
        data['user'] = dict(self.user)  # convert the UserInfo model to a JSON-serializable dict
        if data.get('weibo'):
            is_new = 1  # whether all weibos to write are new, i.e. none of them already exist in the json
            for old in data['weibo']:
                if weibo_info[-1]['id'] == old['id']:
                    is_new = 0
                    break
            if is_new == 0:
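                # some weibos to write already exist in the json:
                # update those entries in place and append the rest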
                for new in weibo_info:
                    flag = 1
                    for i, old in enumerate(data['weibo']):
                        if new['id'] == old['id']:
                            data['weibo'][i] = new
                            flag = 0
                            break
                    if flag:
                        data['weibo'].append(new)
            else:
                data['weibo'] += weibo_info
        else:
            data['weibo'] = weibo_info
        return data

    def write_json(self, wrote_count):
        """将爬到的信息写入json文件"""
        data = {}
        path = self.get_filepath('json')
        if os.path.isfile(path):
            with codecs.open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        weibo_info = self.weibo[wrote_count:]
        data = self.update_json_data(data, weibo_info)
        with codecs.open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
        logger.info(u'%d条微博写入json文件完毕,保存路径:', self.got_count)
        logger.info(path)

    def info_to_mongodb(self, collection, info_list):
        """将爬取的信息写入MongoDB数据库"""
        try:
            import pymongo
        except ImportError:
            logger.warning(
                u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
            sys.exit()
        try:
            from pymongo import MongoClient

            client = MongoClient()
            db = client['weibo']
            collection = db[collection]
            if len(self.write_mode) > 1:
                new_info_list = copy.deepcopy(info_list)
            else:
                new_info_list = info_list
            for info in new_info_list:
                if not collection.find_one({'id': info['id']}):
                    collection.insert_one(info)
                else:
                    collection.update_one({'id': info['id']}, {'$set': info})
        except pymongo.errors.ServerSelectionTimeoutError:
            logger.warning(
                u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
            sys.exit()

    def weibo_to_mongodb(self, wrote_count):
        """将爬取的微博信息写入MongoDB数据库"""
        self.info_to_mongodb('weibo', self.weibo[wrote_count:])
        logger.info(u'%d条微博写入MongoDB数据库完毕', self.got_count)

    def mysql_create(self, connection, sql):
        """创建MySQL数据库或表"""
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql)
        finally:
            connection.close()

    def mysql_create_database(self, mysql_config, sql):
        """创建MySQL数据库"""
        try:
            import pymysql
        except ImportError:
            logger.warning(
                u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
            sys.exit()
        try:
            if self.mysql_config:
                mysql_config = self.mysql_config
            connection = pymysql.connect(**mysql_config)
            self.mysql_create(connection, sql)
        except pymysql.OperationalError:
            logger.warning(u'系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
            sys.exit()

    def mysql_create_table(self, mysql_config, sql):
        """创建MySQL表"""
        import pymysql

        if self.mysql_config:
            mysql_config = self.mysql_config
        mysql_config['db'] = 'weibo'
        connection = pymysql.connect(**mysql_config)
        self.mysql_create(connection, sql)

    def mysql_insert(self, mysql_config, table, data_list):
        """向MySQL表插入或更新数据"""
        import pymysql

        if len(data_list) > 0:
            keys = ', '.join(data_list[0].keys())
            values = ', '.join(['%s'] * len(data_list[0]))
            if self.mysql_config:
                mysql_config = self.mysql_config
            mysql_config['db'] = 'weibo'
            connection = pymysql.connect(**mysql_config)
            cursor = connection.cursor()
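            # build an "INSERT ... ON DUPLICATE KEY UPDATE" statement: new rows are
            # inserted, rows with an existing primary key are updated in place
            # (a standalone sketch of the generated SQL follows the class below)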
            sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON
                     DUPLICATE KEY UPDATE""".format(table=table,
                                                    keys=keys,
                                                    values=values)
            update = ','.join([
                ' {key} = values({key})'.format(key=key)
                for key in data_list[0]
            ])
            sql += update
            try:
                cursor.executemany(
                    sql, [tuple(data.values()) for data in data_list])
                connection.commit()
            except Exception as e:
                connection.rollback()
                logger.exception(e)
            finally:
                connection.close()

    def weibo_to_mysql(self, wrote_count):
        """将爬取的微博信息写入MySQL数据库"""
        mysql_config = {
            'host': 'localhost',
            'port': 3306,
            'user': '******',
            'password': '******',
            'charset': 'utf8mb4'
        }
        # Create the 'weibo' table
        create_table = """
                CREATE TABLE IF NOT EXISTS weibo (
                id varchar(20) NOT NULL,
                bid varchar(12) NOT NULL,
                user_id varchar(20),
                screen_name varchar(30),
                text varchar(2000),
                article_url varchar(100),
                topics varchar(200),
                at_users varchar(1000),
                pics varchar(3000),
                video_url varchar(1000),
                location varchar(100),
                created_at DATETIME,
                source varchar(30),
                attitudes_count INT,
                comments_count INT,
                reposts_count INT,
                retweet_id varchar(20),
                PRIMARY KEY (id)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
        self.mysql_create_table(mysql_config, create_table)
        weibo_list = []
        retweet_list = []
        if len(self.write_mode) > 1:
            info_list = copy.deepcopy(self.weibo[wrote_count:])
        else:
            info_list = self.weibo[wrote_count:]
        for w in info_list:
            if 'retweet' in w:
                w['retweet']['retweet_id'] = ''
                retweet_list.append(w['retweet'])
                w['retweet_id'] = w['retweet']['id']
                del w['retweet']
            else:
                w['retweet_id'] = ''
            weibo_list.append(w)
        # Insert or update weibo data in the 'weibo' table
        self.mysql_insert(mysql_config, 'weibo', retweet_list)
        self.mysql_insert(mysql_config, 'weibo', weibo_list)
        logger.info(u'%d条微博写入MySQL数据库完毕', self.got_count)

    def update_user_config_file(self, user_config_file_path):
        """更新用户配置文件"""
        with open(user_config_file_path, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                logger.error(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序',
                             user_config_file_path)
                sys.exit()
            for i, line in enumerate(lines):
                info = line.split(' ')
                if len(info) > 0 and info[0].isdigit():
                    if self.user_config['user_id'] == info[0]:
                        if len(info) == 1:
                            info.append(self.user.screen_name)
                            info.append(self.start_date)
                        if len(info) == 2:
                            info.append(self.start_date)
                        if len(info) > 2:
                            info[2] = self.start_date
                        lines[i] = ' '.join(info)
                        break
        with codecs.open(user_config_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    def write_data(self, wrote_count):
        """将爬到的信息写入文件或数据库"""
        if self.got_count > wrote_count:
            if 'csv' in self.write_mode:
                self.write_csv(wrote_count)
            if 'json' in self.write_mode:
                self.write_json(wrote_count)
            if 'mysql' in self.write_mode:
                self.weibo_to_mysql(wrote_count)
            if 'mongo' in self.write_mode:
                self.weibo_to_mongodb(wrote_count)
            if self.original_pic_download:
                self.download_files('img', 'original', wrote_count)
            if self.original_video_download:
                self.download_files('video', 'original', wrote_count)
            if not self.filter:
                if self.retweet_pic_download:
                    self.download_files('img', 'retweet', wrote_count)
                if self.retweet_video_download:
                    self.download_files('video', 'retweet', wrote_count)

    def get_pages(self):
        """获取全部微博"""
        try:
            self.get_user_info()
            self.print_user_info()
            since_date = datetime.strptime(self.user_config['since_date'],
                                           '%Y-%m-%d')
            today = datetime.strptime(str(date.today()), '%Y-%m-%d')
            if since_date <= today:
                page_count = self.get_page_count()
                wrote_count = 0
                page1 = 0
                random_pages = random.randint(1, 5)
                self.start_date = datetime.now().strftime('%Y-%m-%d')
                for page in range(self.start_page, page_count + 1):
                    is_end = self.get_one_page(page)
                    if is_end:
                        break

                    if page % 20 == 0:  # write results to file every 20 pages
                        self.write_data(wrote_count)
                        wrote_count = self.got_count

                    # Add a random wait to avoid being rate limited. Crawling too
                    # fast can trigger a temporary block (it lifts automatically
                    # after a while); random waits mimic human behavior and lower
                    # that risk. By default the crawler sleeps 6-10 seconds after
                    # every 1-5 pages; if it still gets blocked, increase the
                    # sleep time.
                    if (page -
                        page1) % random_pages == 0 and page < page_count:
                        sleep(random.randint(6, 10))
                        page1 = page
                        random_pages = random.randint(1, 5)

                self.write_data(wrote_count)  # write the remaining weibos (fewer than 20 pages) to file
            logger.info(u'微博爬取完成,共爬取%d条微博', self.got_count)
        except Exception as e:
            logger.exception(e)

    def get_user_config_list(self, file_path):
        """获取文件中的微博id信息"""
        with open(file_path, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                logger.error(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序', file_path)
                sys.exit()
            user_config_list = []
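            # each config line: "user_id [screen_name] [since_date or days-back count] [query1,query2,...]"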
            for line in lines:
                info = line.split(' ')
                if len(info) > 0 and info[0].isdigit():
                    user_config = {}
                    user_config['user_id'] = info[0]
                    if len(info) > 2:
                        if self.is_date(info[2]):
                            user_config['since_date'] = info[2]
                        elif info[2].isdigit():
                            since_date = date.today() - timedelta(int(info[2]))
                            user_config['since_date'] = str(since_date)
                    else:
                        user_config['since_date'] = self.since_date
                    if len(info) > 3:
                        user_config['query_list'] = info[3].split(',')
                    else:
                        user_config['query_list'] = self.query_list
                    if user_config not in user_config_list:
                        user_config_list.append(user_config)
        return user_config_list

    def initialize_info(self, user_config):
        """初始化爬虫信息"""
        self.weibo = []
        self.user = UserInfo()
        self.user_config = user_config
        self.got_count = 0
        self.weibo_id_list = []

    def start(self):
        """运行爬虫"""
        for user_config in self.user_config_list:
            if len(user_config['query_list']):
                for query in user_config['query_list']:
                    self.query = query
                    self.initialize_info(user_config)
                    self.get_pages()
            else:
                self.initialize_info(user_config)
                self.get_pages()
            logger.info(u'信息抓取完毕')
            logger.info('*' * 100)
            if self.user_config_file_path and self.user.user_id:
                self.update_user_config_file(self.user_config_file_path)
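# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original example): what the upsert
# statement built in mysql_insert above expands to. The example row below is
# hypothetical; only the table name 'weibo' and the statement shape come from
# the code above. It runs without any database connection.
example_row = {'id': '4567890123456789', 'bid': 'JtArLj0pG', 'attitudes_count': 10}
example_keys = ', '.join(example_row.keys())
example_values = ', '.join(['%s'] * len(example_row))
example_sql = 'INSERT INTO weibo({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format(
    keys=example_keys, values=example_values)
example_sql += ','.join(' {key} = values({key})'.format(key=key) for key in example_row)
print(example_sql)
# prints (on one line):
# INSERT INTO weibo(id, bid, attitudes_count) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE
#   id = values(id), bid = values(bid), attitudes_count = values(attitudes_count)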
Exemplo n.º 27
0
def get_user_info():
    u = get_current_user()
    if u is None:
        return None
    else:
        return UserInfo.get_or_insert(key_name='user:%s' % u.email())
Exemplo n.º 28
0
def get_user_info():
  u = get_current_user()
  if u is None:
    return None
  else:
    return UserInfo.get_or_insert(key_name='user:%s' % u.email())
Exemplo n.º 29
0
 def fetchUserInfo(self, user_id):
     ## *****Add attempt to get from session or memcache*****
     request = UserInfo.all().filter("fs_id = ", str(user_id))
     user = request.get()
     return user if user else None
Exemplo n.º 30
0
 def fetchAccessToken(self, user_id):
     ## Add attempt to get from session or memcache
     request = UserInfo.all()
     request.filter("fs_id = ", str(user_id))
     user_token = request.get()
     return user_token.token if user_token else None