def collect(url):
    # Fetch one page of the group feed and parse it.
    r = requests.get(url, cookies=cookie, verify=False)
    tree = html.fromstring(r.content)
    postURI = tree.xpath('.//a[text()="Full Story"]')
    nextPage = tree.xpath('.//a/span[text()="See More Posts"]')[0]
    for x in postURI:
        print x.xpath('@href')
    curUrl = nextPage.getparent().xpath('@href')[0]
    # Hard-coded pagination URL used as the stopping condition for the recursion.
    finURL = "/groups/625191517538301?bacr=1388816159%3A633308973393222&refid=18"
    if curUrl != finURL:
        # Save each post permalink, then follow the "See More Posts" link.
        for x in postURI:
            link = x.xpath('@href')[0]
            newLink = Permalink.create(slug=link)
            newLink.save()
        print ""
        print curUrl
        print ""
        collect("https://mbasic.facebook.com" + curUrl)
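
# collect() relies on several definitions that do not appear in this part of the
# file: the `requests`/`lxml` imports, the `cookie` dict passed to requests.get(),
# and the `Permalink` model used by Permalink.create() and Permalink.select().
# A minimal sketch of what they might look like, assuming peewee and a local
# SQLite file (the actual names, field types and database used by the author
# are not shown in this excerpt):
#
# import requests
# from lxml import html, etree
# from peewee import SqliteDatabase, Model, CharField
#
# db = SqliteDatabase('facebook_group.db')
#
# class Permalink(Model):
#     # Relative URL of a post, e.g. "/groups/.../permalink/...".
#     slug = CharField(unique=True)
#
#     class Meta:
#         database = db
#
# db.connect()
# db.create_tables([Permalink], safe=True)
#
# # Session cookies copied from a browser logged in to mbasic.facebook.com;
# # the key names here are placeholders.
# cookie = {"c_user": "...", "xs": "..."}
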
                comment = x.xpath('.//div[1]')[0]
                c = etree.tostring(comment)
                try:
                    like = x.xpath('.//a[@aria-label="Like"]/text()')[0]
                except IndexError:
                    # No visible like count on this comment.
                    like = str(0)
                timestamp = x.xpath('.//abbr/text()')[0]
                # commentt = Comment(potato=pot,
                #                    helper=name,
                #                    helper_slug=link,
                #                    power=like,
                #                    timestamp=timestamp,
                #                    answer=c)
                # commentt.save()
                print c
                # print name, link, like, timestamp, c
                # print etree.tostring(x, pretty_print=True)
            except:
                # Skip comments that fail to parse.
                pass
        # print etree.tostring(description, pretty_print=True)
        # print etree.tostring(base, pretty_print=True)
    except:
        # Skip posts that fail to parse.
        pass


if __name__ == "__main__":
    # collect("https://mbasic.facebook.com/groups/625191517538301")
    # scrape("https://m.facebook.com/groups/625191517538301?view=permalink&id=973219502735499&refid=18&_ft_=qid.6204099145172029259%3Amf_story_key.973219502735499%3Atl_objid.973219502735499#footer_action_list")
    for i, x in enumerate(Permalink.select()):
        print i, x.slug
        scrape("https://mbasic.facebook.com" + x.slug)
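
# The commented-out Comment(...) call in scrape() above implies a second table
# with potato / helper / helper_slug / power / timestamp / answer columns. Its
# definition is not part of this excerpt; a minimal sketch, assuming the same
# peewee database as Permalink and text columns throughout (field names mirror
# the keyword arguments in the commented-out call):
#
# from peewee import TextField
#
# class Comment(Model):
#     potato = CharField()       # bound to `pot` in scrape(), presumably the post being scraped
#     helper = CharField()       # bound to `name`, presumably the commenter's name
#     helper_slug = CharField()  # bound to `link`, presumably the commenter's profile link
#     power = CharField()        # bound to `like`; stored as text (note the str(0) fallback)
#     timestamp = CharField()    # raw text of the <abbr> timestamp element
#     answer = TextField()       # serialized HTML of the comment <div>
#
#     class Meta:
#         database = db
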