def scrape(project_id):
    """Scrape a single project page and persist it to the database.

    Skips work if the project is already stored. Fetches the project page,
    parses title, images, description, author, metadata and the downloads
    table, then saves the user, each download, and finally the project.

    Args:
        project_id: Numeric/string id used to build the project URL.

    Returns:
        True if the project is already in the database or was scraped and
        stored successfully; False if the page 404s or the database insert
        fails.
    """
    db = Database()
    # Already scraped — nothing to do.
    if db.get_project(project_id):
        return True

    url = "https://smutba.se/project/{}/".format(project_id)
    page_html = requests.get(url)
    if page_html.status_code == 404:
        print("Project {} does not exist on the site".format(project_id))
        return False

    page_soup = soup(page_html.content, "html5lib")

    project = Project(project_id)
    project.title = page_soup.find("h1", {"id": "file_title"}).text
    for img in page_soup.find_all("img", {"class": "project-detail-image-main"}):
        project.images.append(BASE_URL + img["src"])
    # Raw inner HTML of the description panel, not plain text.
    project.description = page_soup.find("div", {"class": "panel__body"}).decode_contents()

    # The author's id is the second-to-last path segment of the profile link
    # (href like ".../user/<id>/").
    user_id = page_soup.find("h4", {"class": "panel__avatar-title"}).find("a").get("href", "").split("/")[-2]
    user = User(user_id)
    user.name = page_soup.find("span", {"class": "username"}).text
    user.add_user()
    project.user = user

    # Footer <dd> order on the page: posted date, view count, category, licence.
    info = page_soup.find("div", {"class": "panel__footer"}).find_all("dd")
    project.posted = info[0].text
    project.views = info[1].text
    project.category = info[2].text
    project.licence = info[3].text

    # Downloads table: rows come in pairs — a data row (<td>s) followed by a
    # row holding the mirror links, hence the step of 2.
    trs = page_soup.find("tbody").find_all("tr")
    for i in range(0, len(trs), 2):
        tds = trs[i].find_all("td")
        if len(tds) < 4:
            # Trailing/non-data row — no more downloads to parse.
            break
        download = Download(tds[0].strong.text)
        download.downloads = tds[1].text
        download.created = tds[2].text
        download.filesize = tds[3].text
        download.project_id = project.id
        for link in trs[i + 1].find_all("a"):
            download.urls.append(BASE_URL + link.get("href", ""))
        download.add_download()

    success = project.add_project()
    if not success:
        # Bug fix: previously fell through and printed the success message
        # even on failure, and the function returned None on this path.
        print("Project {} was not successfully added to the database".format(project_id))
        return False
    print("Project {} added to the database".format(project_id))
    return True