Python query示例，jquery.query Python示例

示例#1

0

显示文件

文件： login.py 项目： konjac/renren-friends-information

def open(email, password):
    """Log in to the renren mobile site and return the login response body.

    Installs a process-wide ``urllib2`` opener backed by a cookie jar, so
    cookies set during login are reused by later ``urllib2.urlopen`` calls.

    If the landing page carries a captcha (a second ``<img>`` element plus an
    ``input[name=verifykey]`` field), the image URL is printed and the user is
    asked on stdin for the code, which is then submitted with the credentials.

    NOTE: this function shadows the builtin ``open`` inside its module; the
    name is kept because it is the public entry point (``login.open``).

    Args:
        email: account e-mail, sent as the ``email`` form field.
        password: account password, sent as the ``password`` form field.

    Returns:
        The raw body of the response to the login POST.
    """
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    urllib2.install_opener(opener)

    landing = urllib2.urlopen('http://m.renren.com')
    try:
        html = landing.read()
    finally:
        landing.close()

    xn = {'email': email, 'password': password}

    # Captcha detection is best effort: a page without the captcha elements
    # simply has nothing at those indexes. Only the lookup is guarded, so a
    # Ctrl-C at the prompt below is no longer swallowed.
    try:
        img = jquery.query(html, "img")[1].attrib['src']
        verifykey = jquery.query(html, "input[name=verifykey]")[0].attrib['value']
    except Exception:
        img = verifykey = None

    if verifykey is not None:
        print(img)
        verifycode = raw_input("please visit the above url in your browser, and type the verify code on the page:\n")
        xn['verifycode'] = verifycode
        xn['verifykey'] = verifykey

    data = urllib.urlencode(xn)
    req = urllib2.Request('http://3g.renren.com/login.do?fx=0&autoLogin=true', data)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        resp.close()

示例#2

0

显示文件

文件： detail_spider.py 项目： yanshengli/renren_craweller

def fetch_details(filepath,outputfile):
    """Fetch the "details" page of every profile link and dump it to a file.

    Every file in ``filepath`` is read as a list of profile URLs, one per
    line. For each URL containing ``id=<digits>`` the matching details page
    (``profile`` replaced by ``details`` in the URL) is downloaded, and one
    output file named ``outputfile + "id=<digits>"`` is written. It holds a
    JSON-like array with a single object: ``uid``, ``Name`` and one entry per
    ``key：value`` pair found in the page's ``.list`` elements.

    Args:
        filepath: directory containing the link files.
        outputfile: prefix (typically a directory ending in a separator)
            prepended to each output file name.
    """
    id_pattern = re.compile(r'id=(\d+)')
    files = os.listdir(filepath)
    print(files)
    number = 0
    for link_file in files:
        # Read all links up front so the input handle is always released,
        # including when the file is empty.
        with open(os.path.join(filepath, link_file), 'r') as f:
            links = f.readlines()

        for raw_link in links:
            # Drop the trailing newline so it is not sent as part of the URL.
            data = raw_link.strip()
            match = id_pattern.search(data)
            if match is None:
                # Blank or malformed line: nothing to fetch.
                continue
            filename = match.group()
            uid = match.group(1)
            print(data)

            # Only the details page is needed; the profile page itself was
            # previously downloaded and then discarded.
            url = data.replace("profile", "details")
            detail_page = urllib2.urlopen(url).read()
            time.sleep(2)  # throttle requests
            name = jquery.query(detail_page, ".sec b a").text()
            res = jquery.query(detail_page, ".list")
            info = "\n".join(res.listOuterHtml())
            if not info:
                info = u"null"

            # The file is opened only after the download succeeded, so a
            # failed fetch cannot leave a truncated "[" file behind, and the
            # handle is closed even if a write raises.
            with open(outputfile+filename, 'w') as f_detail:
                # One object per file, so no "," separator is ever needed
                # (the old per-input-file counter produced "[,{" here).
                f_detail.write('[')
                f_detail.write("{\n")
                f_detail.write(("\t\"%s\": %d")%("uid", int(uid)))
                f_detail.write((",\n\t\"%s\": \"%s\"")%("Name", util.utf8_wrapper(name)))
                # Text runs of the form "label：value" (full-width colon)
                # between tags become key/value entries.
                for s in re.findall(u"[^>]*：[^<]*", info):
                    idx = s.find(u"：")
                    f_detail.write(util.utf8_wrapper((",\n\t\"%s\": \"%s\"")%(s[0:idx], s[idx+1:])))
                f_detail.write("\n}")
                f_detail.write('\n'+']')

            time.sleep(1)  # throttle requests
            number = number+1
        print("已分析数据 "+str(number)+'个')

示例#3

0

显示文件

文件： get_friendslist.py 项目： yanshengli/renren_craweller

def get_friendlist(file_input,lay2_file_path):
    """Crawl the friend list of every profile linked from the input files.

    Every file in ``file_input`` is read as a list of profile URLs, one per
    line. For each URL containing ``id=<digits>`` whose id is not already a
    file name in ``lay2/edge/``, the profile page is fetched, its
    "friends" link is followed (switched from common friends to all
    friends), and every page of that list is walked. The collected profile
    links, one per line, are handed to ``save_lay1friend_file`` together with
    ``lay2_file_path + <id>``.

    Pages containing ``>关注者<`` are reported as public pages and skipped
    after a 10 second pause.

    Args:
        file_input: directory containing the link files.
        lay2_file_path: prefix prepended to the id to form the output path.

    Raises:
        AttributeError: if a fetched profile page does not contain the
            expected friend-list link (``re.search`` finds nothing).
    """
    files = os.listdir(file_input)

    # Ids already crawled; a set keeps the per-link membership test O(1).
    # NOTE(review): this directory is hard-coded while the output location
    # comes from lay2_file_path -- confirm the two are meant to differ.
    sum_file = set(os.listdir('lay2/edge/'))

    pattern = '详细资料.*(他|她)的好友'
    id_pattern = re.compile(r'id=\d+')

    for link_file in files:
        print(link_file)
        # Read all links up front so the input handle is always released.
        with open(os.path.join(file_input, link_file)) as input_filedata:
            links = input_filedata.readlines()

        for raw_link in links:
            # Drop the trailing newline so it is not sent as part of the URL.
            data = raw_link.strip()
            match = id_pattern.search(data)
            if match is None:
                # Blank or malformed line: nothing to fetch.
                continue
            print(data)
            filename = match.group()
            if filename[3:] in sum_file:
                continue

            html = urllib2.urlopen(data,timeout=15).read()
            if re.search('>关注者<',html) is not None:
                print('公共主页')
                time.sleep(10)
                continue

            # Cut the href value out of the matched anchor: skip 'href="'
            # (6 characters) and drop the trailing 14 bytes, i.e. the closing
            # '">' plus the link text (presumably UTF-8 bytes -- TODO confirm).
            url = re.search(pattern,html).group()
            index1 = url.find('href=')
            buff = url[index1+6:-14]
            buff = buff.replace('amp;','')
            # Without this the link lists common friends only.
            buff = buff.replace('f=same','f=all')

            html = urllib2.urlopen(buff,timeout=15).read()
            time.sleep(2)

            # Walk every page of the friend list.
            friends = []
            while True:
                profiles = [t.attrib['href'] for t in jquery.query(html, "a[class=p]")]
                for p in profiles:
                    print(p)
                friends.extend(profiles)
                if html.find("下一页") == -1:
                    break
                next_page = jquery.query(html, u"[title=下一页]")[0].attrib['href']
                html = urllib2.urlopen(next_page,timeout=15).read()
                time.sleep(2)

            # Join once instead of growing a string inside the loop.
            myfriendlist = "".join(p + "\n" for p in friends)
            save_lay1friend_file(myfriendlist,lay2_file_path+filename[3:])

示例#4

0

显示文件

文件： get_friendslist.py 项目： yanshengli/renren_craweller

def get_myfriendlist(output_file,myid):
    """Download the logged-in user's friend list and save the profile links.

    Starts at ``http://3g.renren.com/friendlist.do`` and follows the
    "下一页" (next page) link until a page no longer contains that text,
    collecting the ``href`` of every ``a[class=p]`` element. The links are
    written to ``output_file`` one per line, followed by one extra newline.

    The output file is opened only after crawling has finished, so a failed
    request no longer truncates an existing file or leaves a handle open.

    Args:
        output_file: path of the file to (over)write.
        myid: unused; kept for interface compatibility.
    """
    html = urllib2.urlopen('http://3g.renren.com/friendlist.do').read()
    friends = []
    while True:
        profiles = [t.attrib['href'] for t in jquery.query(html, "a[class=p]")]
        for p in profiles:
            print(p)
        friends.extend(profiles)

        if html.find("下一页") == -1:
            break
        next_page = jquery.query(html, u"[title=下一页]")[0].attrib['href']
        print(next_page)
        html = urllib2.urlopen(next_page).read()
        time.sleep(2)  # throttle requests

    with open(output_file, 'w') as f:
        # Join once instead of growing a string inside the loop.
        f.write("".join(p + "\n" for p in friends))
        f.write("\n")

示例#5

0

显示文件

文件： main.py 项目： konjac/renren-friends-information

email = config.get("account", "email")
password = config.get("account", "password")
output_file = config.get("output", "filename")

print email, password, output_file

f = open(output_file, 'w')

login.open(email, password)

html = urllib2.urlopen('http://3g.renren.com/friendlist.do').read()
page_cnt = 0
person_cnt = 0
f.write("[");
while True:
    profiles = map(lambda t : t.attrib['href'], jquery.query(html, "a[class=p]"))
    for p in profiles:
        url = p.replace("profile", "details")
        uid = re.findall(r"id=([0-9]+)", url)[0]
        print uid, person_cnt
        detail_page = urllib2.urlopen(url).read()
        name = jquery.query(detail_page, ".sec b a").text()
        res = jquery.query(detail_page, ".list")
        info = "\n".join(res.listOuterHtml());
        if len(info) == 0:
            info = u"null"
        if person_cnt>0:
            f.write(",");
        f.write("{\n");
        f.write(("\t\"%s\": %d")%("uid", int(uid)))
        f.write((",\n\t\"%s\": \"%s\"")%("Name", util.utf8_wrapper(name)))