示例#1
0
文件: liste.py 项目: hos/eksitools
    def getNumberOfPages(self, delay=10):
        """Return the total page count of this liste.

        Fetches page 2 of the liste (any page carries the pager widget)
        and reads its ``data-pagecount`` attribute.

        delay -- forwarded to helper.urlopen (throttling between requests).
        """
        # NOTE(review): assumes the "pager" div always exists — confirm for
        # single-page listeler.
        page_url = settings.eksiUrl + self.path + "?p=2"
        response = helper.urlopen(page_url, delay=delay)
        markup = response.read()

        parsed = BeautifulSoup(markup, "html.parser")
        pager = parsed.find("div", class_="pager")
        return int(pager["data-pagecount"])
示例#2
0
文件: liste.py 项目: hos/eksitools
    def getListeBasliksFromUrl(self, url, delay=10):
        """Scrape one page of a liste into Baslik / ListeBaslik pairs.

        url   -- fully qualified page URL to fetch.
        delay -- seconds of throttling, forwarded to helper.urlopen.

        Returns a pair (basliks, listeBasliks); both lists are empty when
        the page cannot be fetched.
        """
        try:
            page = helper.urlopen(url, delay=delay)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate.
            logging.error("Failed to open url: " + url)
            # BUG FIX: the old code returned a bare [] here while the
            # success path returns a two-tuple, crashing callers that
            # unpack the result. Return an empty pair instead.
            return [], []

        html = page.read()
        soup = BeautifulSoup(html, "html.parser")

        resultBasliks = []
        resultListeBasliks = []

        # The last topic-list <ul> on the page holds the actual listing.
        for item in soup.find_all("ul", class_="topic-list")[-1].find_all("li"):
            anchor = item.find("a")
            path = anchor.attrs["href"]

            if path[0] == "/":
                path = path[1:]

            # Skip promoted ("sponsored") rows; anchors may carry no class.
            classes = anchor.attrs.get("class") or []
            if classes and classes[0] == "sponsored":
                continue

            # path looks like "name--12345?...": drop the query string,
            # then split the numeric id off the slug.
            slug = path.rsplit("?", 1)[0]
            name = slug.rsplit("--", 1)[0]
            id_ = slug.rsplit("--", 1)[1]

            count = 0
            if item.find("small"):
                # <small> holds the new-entry counter; remove it so it does
                # not bleed into the title text extracted below.
                count = int(item.find("small").getText().strip())
                item.small.decompose()

            text = item.text.strip()

            b = Baslik(name=name, text=text, id_=id_)
            l = ListeBaslik(count=count, baslik_id=b.id_,
                            liste_id=self.id_,
                            path=path)

            resultBasliks.append(b)
            resultListeBasliks.append(l)

        return resultBasliks, resultListeBasliks
示例#3
0
文件: crawlers.py 项目: hos/eksitools
def getEntriesFromUrl(url, delay=10):
    """Download one baslik (topic) page and parse its entries.

    url   -- entry-list page URL.
    delay -- seconds of throttling, forwarded to helper.urlopen.

    Returns a list of Entry objects ([] when the page cannot be fetched).
    """
    try:
        page = helper.urlopen(url, delay=delay)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error("Failed to open url: " + url)
        return []

    html = page.read()
    soup = BeautifulSoup(html, "html.parser")
    result = []

    # The topic id lives on the <h1 id="title"> element.
    baslik_id = soup.find('h1', id='title').attrs['data-id']

    for entry in soup.find('ul', id='entry-list').find_all('li'):
        id_ = str(entry.attrs['data-id'])

        # Entry body with <br> converted to newlines by the helper.
        text = textWithNewlines(entry.find('div', class_='content')).strip()

        author = str(entry.attrs['data-author'])
        favoriteCount = int(entry.attrs['data-favorite-count'])
        dateText = entry.find('a', class_='entry-date').text

        try:
            timestamp = helper.datetimeToTimestamp(
                getDatetimesFromEntryDate(dateText)[0])
        except Exception:
            # Date strings come in several formats; fall back to None
            # rather than dropping the whole entry.
            timestamp = None

        result.append(Entry(text=text,
                            author=author,
                            timestamp=timestamp,
                            baslik_id=baslik_id,
                            favoriteCount=favoriteCount,
                            id_=id_))
    return result
示例#4
0
文件: crawlers.py 项目: hos/eksitools
def getBasliksFromUrl(url, delay=10):
    """Download a topic-list page and parse it into Baslik objects.

    url   -- topic-list page URL.
    delay -- seconds of throttling, forwarded to helper.urlopen.

    Returns a list of Baslik objects ([] when the page cannot be fetched).
    """
    try:
        page = helper.urlopen(url, delay=delay)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error("Failed to open url: " + url)
        return []

    html = page.read()
    soup = BeautifulSoup(html, "html.parser")
    result = []

    # The last topic-list <ul> on the page holds the actual listing.
    for item in soup.find_all('ul', class_='topic-list')[-1].find_all('li'):
        anchor = item.find('a')
        path = anchor.attrs['href']

        # Skip promoted ("sponsored") rows; anchors may carry no class.
        classes = anchor.attrs.get('class') or []
        if classes and classes[0] == "sponsored":
            continue

        # path looks like "/name--12345?...": strip the leading slash and
        # query string, then split the numeric id off the slug.
        slug = path[1:].rsplit('?', 1)[0]
        name = slug.rsplit('--', 1)[0]
        id_ = slug.rsplit('--', 1)[1]

        # NOTE(review): counter is parsed but never passed on; the int()
        # conversion and decompose() side effect are kept for parity.
        counter = None
        if item.find('small'):
            counter = int(item.find('small').getText().strip())
            # Remove <small> so the counter does not bleed into the title.
            item.small.decompose()

        text = item.text.strip()
        result.append(Baslik(name=name, text=text, id_=id_))

    return result