def open(email, password):
    """Log in to the renren.com mobile site and return the login response body.

    Installs a process-wide urllib2 opener backed by a fresh cookie jar, so
    cookies set during the login are reused by later ``urllib2.urlopen``
    calls.  When the landing page carries a verification image, its URL is
    printed and the user is asked to type the code shown on it.

    NOTE(review): the name shadows the builtin ``open`` for any other code in
    the same module; it is kept unchanged because it is the public entry
    point callers use.

    Args:
        email: account e-mail, posted as the ``email`` form field.
        password: account password, posted as the ``password`` form field.

    Returns:
        The raw body of the response to the login POST.
    """
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    urllib2.install_opener(opener)
    html = urllib2.urlopen('http://m.renren.com').read()

    xn = {'email': email, 'password': password}

    # Only the captcha lookup is best-effort.  The prompt is kept out of this
    # ``try`` so that Ctrl-C / SystemExit are never silently swallowed.
    try:
        img = jquery.query(html, "img")[1].attrib['src']
        verifykey = jquery.query(html, "input[name=verifykey]")[0].attrib['value']
    except Exception:
        # No verification image/key could be read from the page: log in with
        # the credentials alone.
        pass
    else:
        print(img)
        try:
            verifycode = raw_input("please visit the above url in your browser, and type the verify code on the page:\n")
        except EOFError:
            # stdin is closed (non-interactive run): continue without the
            # captcha fields instead of aborting the login attempt.
            pass
        else:
            xn['verifycode'] = verifycode
            xn['verifykey'] = verifykey

    data = urllib.urlencode(xn)
    req = urllib2.Request('http://3g.renren.com/login.do?fx=0&autoLogin=true', data)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        resp.close()
def fetch_details(filepath, outputfile):
    """Download the details page of every profile link listed under *filepath*.

    Every file in the ``filepath`` directory is read as a list of profile
    URLs, one per line.  For each URL that contains ``id=<digits>`` the
    matching "details" page is fetched and a JSON-like record is written to
    its own file named ``outputfile + "id=<digits>"``.  The record holds the
    uid, the name, and every ``key:value`` text run found in the page's
    ``.list`` elements.

    Lines without an ``id=<digits>`` part (for example blank lines) are
    skipped.

    Args:
        filepath: directory containing the link files.
        outputfile: prefix that the ``id=<digits>`` file name is appended to
            by plain string concatenation.
    """
    id_pattern = re.compile(r'id=(\d+)')
    # "key:value" text runs that sit between tags in the details markup.
    field_pattern = re.compile(u"[^>]*:[^<]*")

    files = os.listdir(filepath)
    print(files)
    number = 0
    for link_file_name in files:  # the directory holds many link files
        with open(os.path.join(filepath, link_file_name), 'r') as link_file:
            links = link_file.readlines()

        # Each file holds many links; each link belongs to one user.
        for data in links:
            id_match = id_pattern.search(data)
            if id_match is None:
                # Blank or malformed line: there is no user id to work with.
                continue
            print(data)

            # ---- fetch the details page ----
            url = data.strip().replace("profile", "details")
            uid = id_match.group(1)
            detail_page = urllib2.urlopen(url).read()
            time.sleep(2)  # throttle requests
            name = jquery.query(detail_page, ".sec b a").text()
            res = jquery.query(detail_page, ".list")
            info = "\n".join(res.listOuterHtml())
            if not info:
                info = u"null"

            # ---- write the record ----
            # The file is opened only once the page has been fetched, so a
            # failed request does not leave a half-written file behind.  Each
            # file holds exactly one record, so no separating comma is needed.
            with open(outputfile + id_match.group(), 'w+') as f_detail:
                f_detail.write('[')
                f_detail.write("{\n")
                f_detail.write("\t\"%s\": %d" % ("uid", int(uid)))
                f_detail.write(",\n\t\"%s\": \"%s\"" % ("Name", util.utf8_wrapper(name)))
                for s in field_pattern.findall(info):
                    idx = s.find(u":")
                    f_detail.write(util.utf8_wrapper(",\n\t\"%s\": \"%s\"" % (s[0:idx], s[idx + 1:])))
                f_detail.write("\n}")
                f_detail.write('\n' + ']')
            time.sleep(1)  # throttle requests

        number += 1
        print("已分析数据 " + str(number) + '个')
def get_friendlist(file_input, lay2_file_path):
    """Crawl the friend list of every profile linked from files in *file_input*.

    Every file in the ``file_input`` directory is read as a list of profile
    URLs, one per line.  For each URL whose id is not already present in
    ``lay2/edge/``, the profile page is fetched, the link to that user's
    friend list is extracted, and all pages of the list are walked.  The
    collected profile links (one per line) are handed to
    ``save_lay1friend_file`` together with ``lay2_file_path + <id>``.

    Skipped without saving anything:
      * lines without an ``id=<digits>`` part (for example blank lines);
      * pages containing ``>关注者<`` (reported as a public page);
      * pages on which no friend-list link can be found.

    Args:
        file_input: directory containing the link files.
        lay2_file_path: prefix the numeric id is appended to, forming the
            second argument passed to ``save_lay1friend_file``.
    """
    id_pattern = re.compile(r'id=\d+')
    friends_link_pattern = re.compile('详细资料.*(他|她)的好友')

    files = os.listdir(file_input)
    # NOTE(review): the "already done" list is read from the hard-coded
    # 'lay2/edge/' directory rather than from lay2_file_path -- confirm that
    # callers always pass that directory.
    sum_file = set(os.listdir('lay2/edge/'))

    for link_file_name in files:  # iterate over the link files
        print(link_file_name)
        with open(os.path.join(file_input, link_file_name)) as input_filedata:
            links = input_filedata.readlines()

        for data in links:  # each line is one profile link
            print(data)
            id_match = id_pattern.search(data)
            if id_match is None:
                # Blank or malformed line: there is no user id to work with.
                continue
            user_id = id_match.group()[3:]  # drop the leading "id="
            if user_id in sum_file:
                continue

            html = urllib2.urlopen(data, timeout=15).read()
            if re.search('>关注者<', html) is not None:
                print('公共主页')
                time.sleep(10)
                continue

            link_match = friends_link_pattern.search(html)
            if link_match is None:
                # Nothing to crawl for this profile; skipping keeps one such
                # page from aborting the whole run.
                print('no friend-list link found, skipping')
                continue

            # Cut the URL out of the matched markup: skip past `href=` and its
            # opening quote (6 characters) and drop the 14 trailing bytes
            # (presumably the closing `">` plus the UTF-8 label -- TODO confirm).
            markup = link_match.group()
            href_at = markup.find('href=')
            buff = markup[href_at + 6:-14]
            buff = buff.replace('amp;', '')
            # Without this the request returns the mutual friends only.
            buff = buff.replace('f=same', 'f=all')

            html = urllib2.urlopen(buff, timeout=15).read()  # first list page
            time.sleep(2)

            # ---- walk every page of the friend list ----
            friend_links = []
            while True:
                profiles = [a.attrib['href'] for a in jquery.query(html, "a[class=p]")]
                for p in profiles:
                    print(p)
                friend_links.extend(profiles)
                if "下一页" not in html:
                    break
                next_page = jquery.query(html, u"[title=下一页]")[0].attrib['href']
                html = urllib2.urlopen(next_page, timeout=15).read()
                time.sleep(2)

            myfriendlist = "".join(p + "\n" for p in friend_links)
            save_lay1friend_file(myfriendlist, lay2_file_path + user_id)
def get_myfriendlist(output_file, myid):
    """Save the profile links of the logged-in user's friends to *output_file*.

    Starts at ``http://3g.renren.com/friendlist.do`` and follows the
    "下一页" (next page) link until no further page exists, printing every
    profile link and every next-page URL along the way.  The links are written
    one per line, followed by one extra empty line.

    The output file is opened only after the crawl has finished, so a network
    failure neither truncates an existing file nor leaks an open handle.

    Args:
        output_file: path of the file to (over)write.
        myid: unused; accepted to keep the existing call signature.
    """
    html = urllib2.urlopen('http://3g.renren.com/friendlist.do').read()
    friend_links = []
    while True:
        profiles = [a.attrib['href'] for a in jquery.query(html, "a[class=p]")]
        for p in profiles:
            print(p)
        friend_links.extend(profiles)
        if "下一页" not in html:
            break
        next_page = jquery.query(html, u"[title=下一页]")[0].attrib['href']
        print(next_page)
        html = urllib2.urlopen(next_page).read()
        time.sleep(2)  # throttle requests

    with open(output_file, 'w+') as f:
        f.write("".join(p + "\n" for p in friend_links))
        f.write("\n")
email = config.get("account", "email") password = config.get("account", "password") output_file = config.get("output", "filename") print email, password, output_file f = open(output_file, 'w') login.open(email, password) html = urllib2.urlopen('http://3g.renren.com/friendlist.do').read() page_cnt = 0 person_cnt = 0 f.write("["); while True: profiles = map(lambda t : t.attrib['href'], jquery.query(html, "a[class=p]")) for p in profiles: url = p.replace("profile", "details") uid = re.findall(r"id=([0-9]+)", url)[0] print uid, person_cnt detail_page = urllib2.urlopen(url).read() name = jquery.query(detail_page, ".sec b a").text() res = jquery.query(detail_page, ".list") info = "\n".join(res.listOuterHtml()); if len(info) == 0: info = u"null" if person_cnt>0: f.write(","); f.write("{\n"); f.write(("\t\"%s\": %d")%("uid", int(uid))) f.write((",\n\t\"%s\": \"%s\"")%("Name", util.utf8_wrapper(name)))