示例#1
0
def scrape(project_id):
    """Scrape a single project page and store what was found.

    The page at ``https://smutba.se/project/<project_id>/`` is fetched and
    parsed for the project's title, images, description, author, footer
    metadata and download table. The author, each download and finally the
    project itself are handed to their respective ``add_*`` methods.

    Args:
        project_id: Identifier of the project; used for the database
            lookup and interpolated into the page URL.

    Returns:
        True if the project was already known to the database or was added
        by this call; False if the site answered 404 or ``add_project()``
        reported a failure.
    """
    db = Database()
    if db.get_project(project_id):
        # Already stored: skip the network round-trip and re-parsing.
        return True

    url = "https://smutba.se/project/{}/".format(project_id)
    page_html = requests.get(url)
    if page_html.status_code == 404:
        print("Project {} does not exist on the site".format(project_id))
        return False
    page_soup = soup(page_html.content, "html5lib")

    project = Project(project_id)
    project.title = page_soup.find("h1", {"id": "file_title"}).text
    images_e = page_soup.find_all("img", {"class": "project-detail-image-main"})
    for e in images_e:
        project.images.append(BASE_URL + e["src"])
    project.description = page_soup.find("div", {"class": "panel__body"}).decode_contents()

    # The author link's href ends in ".../<user_id>/", so the id is the
    # second-to-last path segment.
    author_link = page_soup.find("h4", {"class": "panel__avatar-title"}).find("a")
    user_id = author_link.get("href", "").split("/")[-2]
    user = User(user_id)
    user.name = page_soup.find("span", {"class": "username"}).text
    user.add_user()
    project.user = user

    # The footer's <dd> entries are read positionally.
    info = page_soup.find("div", {"class": "panel__footer"}).find_all("dd")
    project.posted = info[0].text
    project.views = info[1].text
    project.category = info[2].text
    project.licence = info[3].text

    # Download rows come in pairs: a metadata row followed by a row holding
    # the links. Pairing with zip() means an unpaired trailing row is
    # skipped instead of raising IndexError.
    trs = page_soup.find("tbody").find_all("tr")
    for meta_row, links_row in zip(trs[::2], trs[1::2]):
        cells = meta_row.find_all("td")
        if len(cells) < 4:
            # Not a metadata row; stop reading the table here.
            break

        download = Download(cells[0].strong.text)
        download.downloads = cells[1].text
        download.created = cells[2].text
        download.filesize = cells[3].text
        download.project_id = project.id

        for link in links_row.find_all("a"):
            download.urls.append(BASE_URL + link.get("href", ""))
        download.add_download()

    success = project.add_project()
    if not success:
        print("Project {} was not successfully added to the database".format(project_id))
        return False
    print("Project {} added to the database".format(project_id))
    return True