def __getLocation(self, name):
    """Resolve a free-text place name to OkCupid's numeric location id.

    Queries the locquery endpoint in API mode (okc_api=1) and returns the
    "locid" field of the JSON response.
    """
    query = urllib.urlencode({"okc_api": 1, "func": "query", "query": name})
    response = SessionManager.getSession().get(
        "http://www.okcupid.com/locquery?%s" % query)
    return json.loads(response.text)["locid"]
def __fillUserProfile(self):
    """Fetch the logged-in user's own profile via the JSON API.

    Records the reported age under [User] Age in the backing config.
    """
    query = urllib.urlencode({"okc_api": 1})
    profileURL = "http://www.okcupid.com/profile/%s?%s" % (self.getUserName(), query)
    response = SessionManager.getSession().get(profileURL)
    profile = json.loads(response.text)
    self.__config.set("User", "Age", "%s" % profile["age"])
def doSearchJSON(url): """ TODO - this currently does one search to many, but we can not depend on the result numbers since blocked profiles are not returned. So gotta make that more effient. """ session = SessionManager.getSession() pageSize = 200 rv = [] i = 0 timeKey = 1 while True: newURL = url + "&timekey=%s&count=%s&low=%s&okc_api=1" % (timeKey,pageSize,(1+i*pageSize)) if i == 0: newURL += "#Search" try: print newURL page = session.get(newURL) except requests.exceptions.ConnectionError: logging.warn("Connection error, sleeping 30 seconds") time.sleep(30) continue """ print page print page.text print page.status_code print page.reason """ if page.status_code != 200: logging.warn("Page Error [%s:%s] sleeping 60 seconds" % (page.status_code,page.reason)) time.sleep(30) continue data = json.loads(page.text) data["url"] = newURL logging.info("Search total_matches [%s] matches [%s]" % (data["total_matches"],len(data["amateur_results"]))) if len(data["amateur_results"]) == 0: total = 0 for v in rv: total += len(v["amateur_results"]) logging.info("\tTotal [%s]" % total) return rv rv.append(data) timeKey = data["cache_timekey"] i+=1 time.sleep(10)
def __fetchPage(self, session, url):
    """GET `url`, retrying forever on connection errors and non-200 status.

    Sleeps 30 seconds between attempts and returns the successful response.
    Keeping the retry local to one request fixes the old behavior where a
    failure mid-profile restarted the whole outer loop, skipping the rest of
    the current profile and double-counting it against the sample budget.
    """
    while True:
        try:
            logging.info(url)
            page = session.get(url)
        except requests.exceptions.ConnectionError:
            logging.warn("Connection error, sleeping 30 seconds")
            time.sleep(30)
            continue
        if page.status_code != 200:
            # Message now matches the actual 30-second sleep (it used to say 60).
            logging.warn("Page Error [%s:%s] sleeping 30 seconds" % (page.status_code, page.reason))
            time.sleep(30)
            continue
        return page

def crawlProfiles(self, names):
    """Download profile JSON, profile HTML, and all question pages for `names`.

    Stops after exhausting `names` or after getMaxSample() profiles have been
    attempted.  For each name writes:
      - <profilePath>/<userid>.json   (API profile dump)
      - <profilePath>/<userid>.html   (rendered profile page, UTF-8)
      - <answerPath>/<userid>.<low>.json  (question pages, stepped by 10)
    Profiles whose API status is > 100 (blocked/unavailable) are skipped.
    """
    count = 0
    idx = 0
    session = SessionManager.getSession()
    while True:
        if idx >= len(names):
            return
        if count >= self.getMaxSample():
            return
        name = names[idx]
        logging.info("[%s]" % name)
        count += 1
        # ---- profile JSON (API mode) -------------------------------------
        page = self.__fetchPage(session, "http://www.okcupid.com/profile/%s?okc_api=1" % name)
        data = json.loads(page.text)
        idx += 1
        if int(data["status"]) > 100:
            logging.warn("Profile returned status [%s:%s]" % (data["status"], data["status_str"]))
            continue
        userId = data["userid"]
        fileName = os.path.join(self.__profilePath, "%s.json" % userId)
        with open(fileName, 'wb') as fp:
            json.dump(data, fp, indent=4, separators=(',', ': '))
        # ---- profile HTML ------------------------------------------------
        page = self.__fetchPage(session, "http://www.okcupid.com/profile/%s" % name)
        fileName = os.path.join(self.__profilePath, "%s.html" % userId)
        with open(fileName, 'wb') as fp:
            fp.write(page.text.encode("UTF-8"))
        # ---- question pages, paginated via `low` -------------------------
        # (The old code fetched each questions URL twice and threw the first
        # response away; now each page is requested once.)
        low = 1
        while True:
            url = "http://www.okcupid.com/profile/%s/questions?okc_api=1&low=%d" % (name, low)
            page = self.__fetchPage(session, url)
            data = json.loads(page.text)
            fileName = os.path.join(self.__answerPath, "%s.%s.json" % (userId, low))
            with open(fileName, 'wb') as fp:
                json.dump(data, fp, indent=4, separators=(',', ': '))
            if data["pagination"]["cur_last"] == data["pagination"]["last"]:
                break
            low += 10
            time.sleep(2)
        # Be polite between profiles.
        time.sleep(10)