if beg != -1: beg += 22 end = raw.index("</span>", beg) else: beg = raw.find('<div id="content">') if beg != -1: beg += 18 end = raw.index("</div>", beg) else: beg = raw.index('zzz="') + 5 beg = raw.index('"', beg) + 2 end = raw.index("<!", beg) raw = raw[beg:end] raw = raw.replace("<BR>", "\n").replace("<br />", "\n") raw = format(raw.decode("gbk")).encode("gbk") sio = StringIO(raw) txt += title + "\n\n" for line in sio: txt += line.strip() + '\n' txt += '\n' print "Done crawling", title common.random_sleep(2) with open(book_title + ".txt", "w") as outf: outf.write(txt)
def main(): config = ConfigParser.RawConfigParser() config.read("config/users.ini") use_proxy = config.getboolean("default", "proxy") common.prepare(use_proxy=use_proxy) client = MongoClient() db = client.topcoder print "Crawling users..." print "Current:", db.users.count() invalid = set() if os.path.exists("config/invalid_handles"): for line in open("config/invalid_handles"): line = line.strip() if line: invalid.add(line) handles = set() for challenge in db.challenges.find(): for reg in challenge["registrants"]: handle = reg["handle"].lower() if u' ' in handle or u'/' in handle or u'\\' in handle: continue if handle in invalid: continue if handle in handles: continue if db.users.find_one({u"handle": handle}): continue handles.add(handle) print len(handles), "users to be crawled." print "-----" for handle in handles: print handle while True: try: request = common.make_request(u"/v3.0.0/members/" + quote(handle)) s = urllib2.urlopen(request).read().decode("utf-8") d = common.to_json(s)[u"result"][u"content"] refine_user(d) user_skills(d) db.users.insert_one(d) common.random_sleep(1) break except urllib2.HTTPError, e: if e.code == 404 or e.code == 403: invalid.add(handle) with open("config/invalid_handles", "w") as fp: for h in sorted(invalid): fp.write(h + '\n') common.random_sleep(1) break else: print "HTTP Error", e.code, e.msg print e.geturl() print e.fp.read() except Exception, e: print "An unknown exception occurred." print e common.random_sleep(20)
def __random_sleep(self):
    """Pause for a random interval, but only when a maximum was configured.

    No-op when ``self.random_sleep_max`` is ``None``; otherwise delegates to
    ``co.random_sleep`` with a lower bound of 1 second.
    """
    if self.random_sleep_max is None:
        return
    co.random_sleep(1, self.random_sleep_max)
def main(): common.prepare(use_proxy=g_config.use_proxy) client = MongoClient() db = client.topcoder print "Crawling users..." print "Current:", db.users.count() if g_config.recrawl_all: print "Recrawl all users" if g_config.recheck_invalid_handles: print "Recheck invalid handles" invalid = set() def add_invalid_handle(hdl): invalid.add(hdl) with open(INVALID_HANDLES_FPATH, "w") as fp: for h in sorted(invalid): try: fp.write(h.encode("utf-8") + '\n') except UnicodeDecodeError: pass if os.path.exists(INVALID_HANDLES_FPATH): for line in open(INVALID_HANDLES_FPATH): line = line.strip() if line: invalid.add(line.decode("utf-8")) handles = set() query = {u"handle": None} field = {u"_id": 1} nb_challeges = db.challenges.count() for index, challenge in enumerate(db.challenges.find()): if (index + 1) % 100 == 0: print "Challenges: %d/%d" % (index + 1, nb_challeges) for reg in challenge[u"registrants"]: handle = reg[u"handle"].lower() for ch in ur" \/": if ch in handle: continue if handle in invalid: continue if handle in handles: continue if not g_config.recrawl_all: query[u"handle"] = handle if db.users.find_one(query, field) is not None: continue handles.add(handle) if g_config.recheck_invalid_handles or g_config.recrawl_all: handles.update(invalid) invalid = set() if os.path.exists(INVALID_HANDLES_FPATH): os.rename(INVALID_HANDLES_FPATH, INVALID_HANDLES_FPATH + ".bak") print len(handles), "users to be crawled" print "-----" for index, handle in enumerate(handles): print "[%d/%d]" % (index + 1, len(handles)), handle while True: try: try: quoted = quote_handle(handle) except KeyError: add_invalid_handle(handle) break request = common.make_request(u"/v3/members/" + quoted) s = common.open_request_and_read(request).decode("utf-8") d = common.to_json(s)[u"result"][u"content"] try: refine_user(d) user_skills(d) user_stats(d) user_external_accounts(d) except: traceback.print_exc() add_invalid_handle(handle) common.random_sleep(DOZE) break db.users.insert_one(d) 
common.random_sleep(DOZE) break except urllib2.HTTPError, e: if e.code in ( 404, 403, ): add_invalid_handle(handle) common.random_sleep(DOZE) break else: print "HTTP Error", e.code, e.msg print e.geturl() print e.fp.read() except KeyboardInterrupt: return except:
404, 403, ): add_invalid_handle(handle) common.random_sleep(DOZE) break else: print "HTTP Error", e.code, e.msg print e.geturl() print e.fp.read() except KeyboardInterrupt: return except: traceback.print_exc() common.random_sleep(ERROR_WAIT) if __name__ == "__main__": while True: # noinspection PyBroadException try: main() break except KeyboardInterrupt: break except: traceback.print_exc()
def main():
    """Crawl past 'develop' challenges from the TopCoder v2 API, page by
    page, and insert each challenge (merged with its registrants and
    submissions) into the local ``challenges`` collection.

    Two modes, controlled by ``config/challenges.ini``:
      - init mode: resume a full backfill from the saved ``page_index``,
        skipping challenges already stored;
      - incremental mode: start from page 1 and stop at the first challenge
        that is already in the database.
    """
    client = MongoClient()
    db = client.topcoder
    config = ConfigParser.RawConfigParser()
    config.read("config/challenges.ini")
    init = config.getboolean("default", "init")
    if init:
        # Resume the backfill where the previous run left off.
        index = config.getint("default", "page_index")
    else:
        index = 1
    use_proxy = config.getboolean("default", "use_proxy")
    common.prepare(use_proxy=use_proxy)
    while True:
        path = "/v2/challenges/past?type=develop&pageIndex=%d&pageSize=10" % index
        raw = common.guarded_read(path)
        # An empty "data" array means we ran past the last page.
        if '"data": []' in raw:
            return
        print "Page", index
        lists = json.loads(raw)
        for challenge in lists["data"]:
            cid = challenge["challengeId"]
            if filter_out(cid):
                continue
            if db.challenges.find_one({"challengeId": cid}):
                if init:
                    # Backfill: already stored, keep scanning this page.
                    continue
                else:
                    # Incremental: everything older is already crawled.
                    return
            common.random_sleep(1)
            print ' ', challenge["challengeName"]
            # Fetch detail, registrants and submissions, then merge them
            # into a single document.
            path = "/v2/challenges/" + str(cid)
            d = common.to_json(common.guarded_read(path))
            # The registrants endpoint returns a bare JSON array; wrap it so
            # it parses as an object with a "registrants" key.
            path = "/v2/challenges/registrants/" + str(cid)
            raw = '{"registrants": %s}' % common.guarded_read(path)
            registrants = common.to_json(raw)
            path = "/v2/challenges/submissions/" + str(cid)
            submissions = common.to_json(common.guarded_read(path))
            d.update(registrants)
            d.update(submissions)
            format_challenge(d)
            db.challenges.insert_one(d)
        index += 1
        if init:
            # Persist progress so an interrupted backfill can resume.
            config.set("default", "page_index", index)
            with open("config/challenges.ini", "wb") as fp:
                config.write(fp)
        common.random_sleep(10)
def main(): common.prepare(use_proxy=g_config.use_proxy) client = MongoClient() db = client.topcoder print "Crawling users..." print "Current:", db.users.count() if g_config.recrawl_all: print "Recrawl all users" if g_config.recheck_invalid_handles: print "Recheck invalid handles" invalid = set() def add_invalid_handle(hdl): invalid.add(hdl) with open(INVALID_HANDLES_FPATH, "w") as fp: for h in sorted(invalid): try: fp.write(h.encode("utf-8") + '\n') except UnicodeDecodeError: pass if os.path.exists(INVALID_HANDLES_FPATH): for line in open(INVALID_HANDLES_FPATH): line = line.strip() if line: invalid.add(line.decode("utf-8")) handles = set() query = {u"handle": None} field = {u"_id": 1} nb_challeges = db.challenges.count() for index, challenge in enumerate(db.challenges.find()): if (index + 1) % 100 == 0: print "Challenges: %d/%d" % (index + 1, nb_challeges) for reg in challenge[u"registrants"]: handle = reg[u"handle"].lower() for ch in ur" \/": if ch in handle: continue if handle in invalid: continue if handle in handles: continue if not g_config.recrawl_all: query[u"handle"] = handle if db.users.find_one(query, field) is not None: continue handles.add(handle) if g_config.recheck_invalid_handles or g_config.recrawl_all: handles.update(invalid) invalid = set() if os.path.exists(INVALID_HANDLES_FPATH): os.rename(INVALID_HANDLES_FPATH, INVALID_HANDLES_FPATH + ".bak") print len(handles), "users to be crawled" print "-----" for index, handle in enumerate(handles): print "[%d/%d]" % (index + 1, len(handles)), handle while True: try: try: quoted = quote_handle(handle) except KeyError: add_invalid_handle(handle) break request = common.make_request(u"/v3/members/" + quoted) s = common.open_request_and_read(request).decode("utf-8") d = common.to_json(s)[u"result"][u"content"] try: refine_user(d) user_skills(d) user_stats(d) user_external_accounts(d) except: traceback.print_exc() add_invalid_handle(handle) common.random_sleep(DOZE) break db.users.insert_one(d) 
common.random_sleep(DOZE) break except urllib2.HTTPError, e: if e.code in (404, 403,): add_invalid_handle(handle) common.random_sleep(DOZE) break else: print "HTTP Error", e.code, e.msg print e.geturl() print e.fp.read() except KeyboardInterrupt: return except:
break except urllib2.HTTPError, e: if e.code in (404, 403,): add_invalid_handle(handle) common.random_sleep(DOZE) break else: print "HTTP Error", e.code, e.msg print e.geturl() print e.fp.read() except KeyboardInterrupt: return except: traceback.print_exc() common.random_sleep(ERROR_WAIT) if __name__ == "__main__": while True: # noinspection PyBroadException try: main() break except KeyboardInterrupt: break except: traceback.print_exc()