def CreateLiveBlob(file_name):
    """Build a fresh profile-database blob from the current crawl progress.

    Clears the database at *file_name*, mirrors every progress section and
    the fetish string map into it, then loads each completed profile and
    group (network fetch via their .load()) and stores them.  Per-item
    progress is logged to stderr.

    :param file_name: path handed to ProfileDb.
    :returns: the populated ProfileDb instance.

    NOTE(review): the value is returned after the ``with`` block's __exit__
    has run, so the caller receives a handle whose context has already been
    closed -- confirm ProfileDb stays usable after exit.
    """
    with ProfileDb(file_name) as profileDb:
        profileDb.Clear()
        stringMap = StringMap()
        progress = Progress()

        # Copy every crawl-progress section into the blob verbatim.
        for section in Progress.SECTIONS:
            profileDb.FillSection(section, progress.getIds(section))
        profileDb.FillStrings("Fetishes", stringMap.getSection("Fetish"))

        # ---------------- profiles ----------------
        pids = set(progress.getIds("CompletedProfiles"))
        sys.stderr.write("Profiles to load: [%s]\n" % len(pids))
        ploaded = 0
        pfailed = 0
        ptotal = len(pids)
        for pid in pids:
            profile = Profile(pid)
            if profile.load():
                profileDb.AddProfile(profile)
                ploaded += 1
                sys.stderr.write("Progress - Loaded Profile [%12s], [%12s] of [%12s], [%3s%% Done]\n" % (pid, ploaded, ptotal, 100 * (ploaded + pfailed) / ptotal))
            else:
                progress.errorProfile(pid)
                pfailed += 1
                sys.stderr.write("Progress - Failed Profile [%12s], [%12s] of [%12s], [%s%% Done]\n" % (pid, pfailed, ptotal, 100 * (ploaded + pfailed) / ptotal))
            # Drop the (potentially large) parsed object before the next fetch.
            del profile

        # ---------------- groups ----------------
        gids = set(progress.getIds("CompletedGroups"))
        sys.stderr.write("Groups to load: [%s]\n" % len(gids))
        gloaded = 0
        gfailed = 0
        gtotal = len(gids)
        for gid in gids:
            group = Group(gid)
            if group.load():
                profileDb.AddGroup(group)
                gloaded += 1
                sys.stderr.write("Progress - Loaded Group [%12s], [%12s] of [%12s], [%3s%% Done]\n" % (gid, gloaded, gtotal, 100 * (gloaded + gfailed) / gtotal))
            else:
                progress.errorGroup(gid)
                # BUG FIX: original incremented the undefined name ``failed``,
                # which raised NameError on the first failed group.
                gfailed += 1
                sys.stderr.write("Progress - Failed Group [%12s], [%12s] of [%12s], [%s%% Done]\n" % (gid, gfailed, gtotal, 100 * (gloaded + gfailed) / gtotal))
            del group

        sys.stderr.write("Loaded [%d] Profiles [%d] Groups [%d] Errors.\n" % (ploaded, gloaded, pfailed + gfailed))
        return profileDb
def fill(self, session):
    """Scrape this profile's FetLife pages and populate its fields.

    Fetches ``https://fetlife.com/users/<Id>`` with *session*, then parses
    name, age/gender/type, location, the relationship / orientation /
    activity table, the "Into:" / "Curious about:" fetish paragraphs, group
    links, last-active time, and the full paginated friends list.

    :param session: an HTTP session object with a ``get(url)`` method
        returning a response with ``.url`` and ``.text`` (e.g. requests).
    :returns: False when the profile redirects away (missing/banned),
        True on success.
    :raises RuntimeError: on an unrecognized table row header.

    Side effects: registers newly seen fetish names in the shared StringMap
    and stamps the crawl date via ``self.setCrawlDate()``.
    """
    sys.stderr.write("Loading Profile [%s]\n" % self.Id)
    assert isinstance(self.Id, numbers.Number)

    self._link = "https://fetlife.com/users/%s" % self.Id
    self._page = session.get(self._link)
    # A missing profile redirects elsewhere; detect it via the final URL.
    if self._page.url != self._link:
        sys.stderr.write("Missing Profile [%s]\n" % self.Id)
        return False

    tree = html.fromstring(self._page.text)
    self.Name = tree.xpath('//h2[@class="bottom"]/text()')[0].strip()

    # Caption of the form "<age><gender> <type>", e.g. "32M Dom".
    rawPair = tree.xpath('//span[@class="small quiet"]/text()')[0].strip()
    splitList = re.split(" ", rawPair)
    if len(splitList) > 1:
        self.Type = splitList[1]
    try:
        self.Age = int(re.sub(r'[^0-9]', '', splitList[0]))
    except ValueError:
        self.Age = -1
    # NOTE(review): int-vs-str comparison is always unequal, so Gender is
    # effectively set unconditionally; the original probably intended
    # str(self.Age) != splitList[0].  Behavior kept as-is -- confirm intent.
    if self.Age != splitList[0]:
        self.Gender = re.sub(r'[0-9 ]', '', splitList[0])

    locationText = tree.xpath('//div[@class="span-13 append-1"]/p/em/a/text()')
    self.Location = [unicode(x) for x in locationText]

    # Profile attribute table: one row per labelled field.
    for row in tree.xpath('//div[@class="span-13 append-1"]/table/tr'):
        children = [x for x in row]
        header = children[0]
        if header.text in ("relationship status:", "D/s relationship status:"):
            assert len(children[1:]) == 1
            td = children[1]
            assert len(td.getchildren()) == 1
            ul = td.getchildren()[0]
            for li in ul:
                # Only <li> entries linking to a partner profile carry an <a>.
                if len(li.getchildren()) == 1:
                    a = li.getchildren()[0]
                    url = a.get("href")
                    rel = li.text.strip()
                    pid = int(re.sub(r'[^0-9 ]', '', url))
                    self.Relationships.append(tuple([pid, rel]))
        elif header.text == "orientation:":
            assert len(children[1:]) == 1
            self.Orientation = children[1].text
        elif header.text == "active:":
            assert len(children[1:]) == 1
            self.Active = children[1].text
        elif header.text == "is looking for:":
            assert len(children[1:]) == 1
            for text in children[1].itertext():
                self.LookingFor.append(text)
        else:
            # BUG FIX: original used the Python-2-only "raise E, msg" form;
            # the call form below is valid in both Python 2 and 3.
            raise RuntimeError("Unknown table [%s]" % header.text)

    lastActive = tree.xpath('//ul[@id="mini_feed"]/li/span[@class="quiet small"]/text()')
    if len(lastActive) != 0:
        self.setLastActive(lastActive[0])

    # Any link containing /groups/ yields a numeric group id.
    for groupURL in tree.xpath('//li/a[contains(@href,"/groups/")]/@href'):
        try:
            self.Groups.add(int(re.sub(r'[^0-9]', '', groupURL)))
        except ValueError:
            pass

    # ------------------------------------------------------------------
    # Fetishes: "Into:" and "Curious about:" share one paragraph format,
    # so a single nested helper replaces the original duplicated code.
    # ------------------------------------------------------------------
    stringMap = StringMap()

    def _parseFetishParagraph(paragraph, target):
        """Collect (fetishId, annotation) pairs from one fetish <p> into
        *target*, a dict mapping annotation -> set of fetish ids; newly
        seen fetish names are registered in the shared StringMap."""
        pairs = []
        for item in paragraph:
            if item.text is None:
                continue
            try:
                if "href" in item.keys():
                    fetishName = item.text
                    fetishId = int(re.sub(r'[^0-9 ]', '', item.get("href")))
                    pairs.append([fetishId, None])
                    if not stringMap.hasString("Fetish", fetishId):
                        stringMap.addString("Fetish", fetishId, fetishName)
                elif len(pairs) > 0:
                    # Plain text after a link is that link's parenthesised
                    # annotation, e.g. "(giving)" -> "giving".
                    pairs[-1][1] = item.text[1:-1]
            except ValueError:
                pass
        for (fetishId, note) in pairs:
            if note not in target:
                target[note] = set()
            target[note].add(fetishId)

    into = tree.xpath('//em[text()="Into:"]/ancestor::p')
    if len(into) != 0:
        _parseFetishParagraph(into[0], self.Into)
    curious = tree.xpath('//em[text()="Curious about:"]/ancestor::p')
    if len(curious) != 0:
        _parseFetishParagraph(curious[0], self.Curious)

    # ------------------------------------------------------------------
    # Friends: walk the paginated list until no "next page" link remains.
    # ------------------------------------------------------------------
    pageNum = 1
    while True:
        self._link = "https://fetlife.com/users/%s/friends?page=%d" % (self.Id, pageNum)
        self._page = session.get(self._link)
        tree = html.fromstring(self._page.text)
        for url in tree.xpath('//div[@class="clearfix user_in_list"]/div/a/@href'):
            self.Friends.append(int(re.sub(r'[^0-9 ]', '', url)))
        # FIX (style): renamed local "next", which shadowed the builtin.
        nextLink = tree.xpath('//a[@class="next_page"]')
        if len(nextLink) == 1:
            pageNum += 1
        else:
            break

    self.setCrawlDate()
    sys.stderr.write("Done Loading Profile [%s]\n" % self.Id)
    return True
while not progress.getExit(): crawler.doTick() sys.stderr.write("Ending Crawler\n") progress.saveProgress() progress.setExit() else: def RunCrawler(num): crawler = Crawler(session,progress) sys.stderr.write("Starting Crawler [%d]\n" % num) while not progress.getExit(): crawler.doTick() sys.stderr.write("Ending Crawler [%d]\n" % num) progress.setExit() progress = Progress() stringMap = StringMap() progress.printProgress() threads = [] for i in range(options.threads): threads.append(Thread(None,target=RunCrawler,args=(i,))) threads[-1].start() try: while not progress.getExit(): time.sleep(60) progress.printProgress() progress.saveProgress() stringMap.save() except: sys.stderr.write("Shutting down from main thread\n") progress.setExit()