def getNumberOfPages(self, delay=10):
    """Return the total page count for this liste.

    Fetches page 2 of the liste (any page carries the same pager widget)
    and reads the "data-pagecount" attribute from the pager <div>.

    delay -- passed through to helper.urlopen (presumably a rate-limit
             wait; confirm against helper's implementation).
    """
    url = settings.eksiUrl + self.path + "?p=2"
    response = helper.urlopen(url, delay=delay)
    markup = response.read()
    soup = BeautifulSoup(markup, "html.parser")
    pager = soup.find("div", class_="pager")
    return int(pager["data-pagecount"])
def getListeBasliksFromUrl(self, url, delay=10):
    """Scrape one topic-list page of a liste.

    Returns a pair ``(basliks, liste_basliks)`` — one Baslik and one
    ListeBaslik per topic row.  On fetch failure the error is logged and
    an empty pair ``([], [])`` is returned.
    """
    try:
        page = helper.urlopen(url, delay=delay)
    except Exception:
        logging.error("Failed to open url: " + url)
        # Bug fix: the success path returns a 2-tuple, so the failure
        # path must too — the original returned a single [] which broke
        # callers that unpack two values.
        return [], []
    soup = BeautifulSoup(page.read(), "html.parser")
    resultBasliks = []
    resultListeBasliks = []
    # The last "topic-list" <ul> on the page holds the rows we want.
    for item in soup.find_all("ul", class_="topic-list")[-1].find_all("li"):
        anchor = item.find("a")
        path = anchor.attrs["href"]
        if path.startswith("/"):
            path = path[1:]
        # Skip sponsored rows; the "class" attribute may be missing, so
        # default to an empty list instead of a bare except.
        classes = anchor.attrs.get("class") or []
        if classes and classes[0] == "sponsored":
            continue
        # Paths look like "name--id?query"; drop the query, then split
        # the slug into its name and id parts.
        slug = path.rsplit("?", 1)[0]
        name = slug.rsplit("--", 1)[0]
        id_ = slug.rsplit("--", 1)[1]
        count = 0
        if item.find("small"):
            # <small> carries the entry counter; remove it afterwards so
            # item.text contains only the title.
            count = int(item.find("small").getText().strip())
            item.small.decompose()
        text = item.text.strip()
        b = Baslik(name=name, text=text, id_=id_)
        l = ListeBaslik(count=count, baslik_id=b.id_, liste_id=self.id_, path=path)
        resultBasliks.append(b)
        resultListeBasliks.append(l)
    return resultBasliks, resultListeBasliks
def getEntriesFromUrl(url, delay=10):
    """Scrape all entries from a baslik (topic) page.

    Returns a list of Entry objects; [] (with an error logged) when the
    page cannot be fetched.
    """
    try:
        page = helper.urlopen(url, delay=delay)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        logging.error("Failed to open url: " + url)
        return []
    soup = BeautifulSoup(page.read(), "html.parser")
    # Look up the title node once; the original re-queried it per attribute.
    baslik_id = soup.find('h1', id='title').attrs['data-id']
    result = []
    for item in soup.find('ul', id='entry-list').find_all('li'):
        id_ = str(item.attrs['data-id'])
        text = textWithNewlines(item.find('div', class_='content')).strip()
        author = str(item.attrs['data-author'])
        favoriteCount = int(item.attrs['data-favorite-count'])
        dateText = item.find('a', class_='entry-date').text
        try:
            timestamp = helper.datetimeToTimestamp(getDatetimesFromEntryDate(dateText)[0])
        except Exception:
            # Date text can be unparsable (e.g. edit annotations); keep the
            # entry without a timestamp rather than dropping it.
            timestamp = None
        result.append(Entry(text=text, author=author, timestamp=timestamp,
                            baslik_id=baslik_id, favoriteCount=favoriteCount,
                            id_=id_))
    return result
def getBasliksFromUrl(url, delay=10):
    """Scrape a topic-list page into a list of Baslik objects.

    Returns [] (with an error logged) when the page cannot be fetched.
    """
    try:
        page = helper.urlopen(url, delay=delay)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        logging.error("Failed to open url: " + url)
        return []
    soup = BeautifulSoup(page.read(), "html.parser")
    result = []
    # The last "topic-list" <ul> on the page holds the rows we want.
    for item in soup.find_all('ul', class_='topic-list')[-1].find_all('li'):
        anchor = item.find('a')
        path = anchor.attrs['href']
        # Skip sponsored rows; "class" may be absent, so default to empty
        # instead of the original bare except.
        classes = anchor.attrs.get('class') or []
        if classes and classes[0] == "sponsored":
            continue
        # Paths look like "/name--id?query"; drop the leading slash and
        # query, then split the slug once into name and id.
        slug = path[1:].rsplit('?', 1)[0]
        name = slug.rsplit('--', 1)[0]
        id_ = slug.rsplit('--', 1)[1]
        if item.find('small'):
            # The original parsed the counter here but never used it; only
            # the decompose() side effect matters, so item.text is title-only.
            item.small.decompose()
        text = item.text.strip()
        result.append(Baslik(name=name, text=text, id_=id_))
    return result