Пример #1
0
def main(use_proxies=False):
    """Crawl the profile and timeline pages of every configured uid and
    store the results in the database.

    Args:
        use_proxies: whether ``gethtml`` should route requests through
            the proxies configured in conf.yaml.
    """
    conf, engine = Connect('conf.yaml')  # read configuration file
    uids = list(conf.get('uids').values())
    cookies = list(conf.get('cookies').values())
    user_agents = list(conf.get('user_agents').values())

    conn = engine.connect()
    try:
        metadata = MetaData(engine)
        # Table Reflection: load the existing schema from the database.
        wb_user = Table('wb_user', metadata, autoload=True)  # user profile table
        wb_data = Table('wb_data', metadata, autoload=True)  # timeline/post table

        for uid in uids:
            # Pick a random cookie / user agent per user to reduce the
            # chance of getting banned.
            cookie = getcookies(random.choice(cookies))
            # BUG FIX: the header name is 'User-Agent' (hyphen). With the
            # original 'User_Agent' key the spoofed agent was never sent.
            headers = {'User-Agent': random.choice(user_agents)}
            infourl = 'https://weibo.cn/' + str(uid) + '/info'  # profile page
            mainurl = 'https://weibo.cn/' + str(uid)  # timeline page
            resinfo = gethtml(infourl, headers, cookie, conf,
                              use_proxies)  # fetch the profile page
            resmain = gethtml(mainurl, headers, cookie, conf,
                              use_proxies)  # fetch the user's main page
            getinfo(resinfo, uid, wb_user, conn)
            getmain(resmain, uid, wb_data, conn, mainurl, user_agents, cookies,
                    conf, use_proxies)
    finally:
        # Always release the connection, even if a fetch/parse step raises.
        conn.close()
Пример #2
0
def main():
    """Crawl the timeline of every configured uid using the login cookies
    previously saved to cookies.pkl."""
    conf, engine = Connect('conf.yaml')  # read configuration file
    uids = list(conf.get('uids').values())
    # BUG FIX: pickle.load(open(...)) leaked the file handle; a context
    # manager guarantees the file is closed.
    # NOTE(review): pickle is unsafe on untrusted data — cookies.pkl must
    # only ever be the file this project wrote itself.
    with open('cookies.pkl', 'rb') as f:
        cookies = pickle.load(f)
    conn = engine.connect()
    try:
        metadata = MetaData(engine)
        # Table Reflection: load the existing schema from the database.
        wb_user = Table('wb_user', metadata, autoload=True)  # user profile table
        wb_data = Table('wb_data', metadata, autoload=True)  # timeline/post table
        for uid in uids:
            getmain(cookies, uid, conn, wb_data, wb_user)
    finally:
        # Always release the connection, even if a crawl step raises.
        conn.close()
Пример #3
0
def get():
    """Log in to weibo.com once per configured account and append each
    account's cookies to cookies.pkl.

    Accounts come from conf.yaml ('loginname' / 'password' maps); the
    captcha is typed in manually by the operator.  If the first captcha
    attempt fails, one retry is offered.
    """
    conf, engine = Connect('conf.yaml')  # read configuration file
    loginname = list(conf.get('loginname').values())
    password = list(conf.get('password').values())

    with open('cookies.pkl', 'wb') as f:
        for i in range(len(password)):  # save the cookies of every account
            # BUG FIX: create the driver OUTSIDE the try block — the except
            # path used `driver`, which would raise NameError if Chrome
            # itself failed to start.
            driver = webdriver.Chrome()
            try:
                # A too-small window makes some elements report
                # is_displayed() == False, i.e. they cannot be interacted with.
                driver.set_window_size(1124, 850)
                driver.get("http://www.weibo.com/login.php")
                time.sleep(5)
                try:
                    driver.find_element_by_xpath('//*[@id="loginname"]').clear()
                    driver.find_element_by_xpath('//*[@id="loginname"]').send_keys(
                        loginname[i])
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input'
                    ).clear()

                    time.sleep(2)
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input'
                    ).send_keys(password[i])
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[3]/div/input'
                    ).send_keys(input("输入验证码: "))

                    time.sleep(1)
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()
                except Exception:
                    # First captcha attempt failed — let the operator retry once.
                    print("验证码输入错误,请重新输入!")
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[3]/div/input'
                    ).send_keys(input("输入验证码: "))
                    time.sleep(1)
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()
                cookies = driver.get_cookies()
                print(cookies)
                pickle.dump(cookies, f)
            finally:
                # BUG FIX: the original never closed the browser, leaking one
                # Chrome instance per account.
                driver.quit()
Пример #4
0
def get_time_str(uid):
    """Fetch every stored post of *uid* and return its timestamps and text.

    Returns:
        (time_lists, text): list of post timestamps (row column 3) and a
        single newline-terminated string concatenating all post contents
        (row column 2).
    """
    _, engine = Connect('../conf.yaml')  # connect to the database
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_data = Table('wb_data', metadata, autoload=True)
    s = select([wb_data]).where(wb_data.c.uid == uid)
    res = conn.execute(s)
    # BUG FIX: don't shadow the builtin `str`, and don't close the
    # connection before iterating the result — rows may be streamed from
    # the open connection.
    text = ''
    time_lists = []
    for row in res:
        text += row[2] + '\n'
        time_lists.append(row[3])
    conn.close()
    return time_lists, text
Пример #5
0
def DeleteUsers():
    """Delete users that exist in wb_user but are no longer listed in
    conf.yaml, together with their posts (wb_data) and topics (wb_topic)."""
    conf, engine = Connect('conf.yaml')
    conn = engine.connect()
    try:
        metadata = MetaData(engine)
        wb_data = Table('wb_data', metadata, autoload=True)
        wb_user = Table('wb_user', metadata, autoload=True)
        wb_topic = Table('wb_topic', metadata, autoload=True)
        res = conn.execute(select([wb_user.c.uid]))  # every uid stored in wb_user
        uids = list(conf.get('uids').values())  # uids kept in the config
        # uids to delete: present in the database but no longer configured
        deluid = [r[0] for r in res if int(r[0]) not in uids]
        for uid in deluid:
            # BUG FIX: the posts delete filtered on wb_user.c.uid, making it
            # a cross-table condition; filter on wb_data's own uid column.
            conn.execute(wb_data.delete().where(wb_data.c.uid == str(uid)))   # delete posts
            conn.execute(wb_topic.delete().where(wb_topic.c.uid == str(uid)))  # delete topics
            conn.execute(wb_user.delete().where(wb_user.c.uid == str(uid)))   # delete profile
    finally:
        conn.close()
Пример #6
0
def Save_Topic_Words(model, feature_names, uid, n_top_words=20):
    """Persist the top words of each LDA topic for *uid* into wb_topic.

    For every topic in ``model.components_`` the ``n_top_words`` highest
    weighted feature names are upserted, one row per word.
    """
    _, engine = Connect('../conf.yaml')
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_topic = Table('wb_topic', metadata, autoload=True)
    for topic_id, weights in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[idx] for idx in top_indices]
        print("Topic #%d:" % topic_id)
        print(top_words)
        for word in top_words:
            stmt = insert(wb_topic).values(
                uid=uid, topic=topic_id, topic_cont=word)
            # MySQL upsert: on a duplicate key just rewrite the topic id.
            stmt = stmt.on_duplicate_key_update(topic=topic_id)
            conn.execute(stmt)

    conn.close()
Пример #7
0
    lda = LatentDirichletAllocation(n_components=topics,#主题数
                                    learning_method='batch',#样本量不大只是用来学习的话用"batch"比较好,这样可以少很多参数要调
                                    )
    #用变分贝叶斯方法训练模型
    lda.fit(tf)

    #依次输出每个主题的关键词表
    tf_feature_names = tf_vectorizer.get_feature_names()

    return lda,tf,tf_feature_names,tf_vectorizer

#将主题以可视化结果展现出来
def pyLDAvisUI(lda, tf, tf_vectorizer):
    """Render the fitted LDA model with pyLDAvis and write it to lda.html."""
    visualisation = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    # Save the interactive topic visualisation as an HTML page.
    pyLDAvis.save_html(visualisation, 'lda.html')


def main(uid):
    """Run topic modelling for one user: extract their words, fit an LDA
    model, store the topic words, and render the visualisation."""
    words, uid = getwords(uid)
    lda_model, tf, feature_names, vectorizer = word2vec(words)
    Save_Topic_Words(lda_model, feature_names, uid)
    pyLDAvisUI(lda_model, tf, vectorizer)

if __name__ == '__main__':
    # Analyse the first uid listed in conf.yaml. That uid must already be
    # configured there and sina_spider must have been run once so the
    # database contains the user's posts.
    conf, _ = Connect('../conf.yaml')
    first_uid = list(conf.get('uids').values())[0]
    main(first_uid)