def extract_profil_from_bellefaye(firstname, lastname):
    """Search a person on bellefaye.com and print the result page's text.

    Logs into the site, then posts the person-search form with the given
    names. The result is only printed; the function always returns None.

    :param firstname: first name to search for
    :param lastname: family (last) name to search for
    """
    # SECURITY NOTE(review): account credentials and a CSRF token are
    # hard-coded in the login payload below — they belong in configuration,
    # and a static CSRF token is likely stale. Flagged, not silently changed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
        'Accept': 'text/html',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    # Bug fix: the original issued two independent requests.post() calls, so
    # the session cookie obtained by the login was dropped and the search ran
    # unauthenticated. A Session carries the cookie over to the search.
    session = wikipedia.requests.Session()
    session.post(
        "https://www.bellefaye.com/fr/login_check",
        data="_csrf_token=c8FvlHO5q-f0XpbhG2lQJifHlmhei_qpGO3WcaLgPqE&_username=h.hoareau%40femis.fr&_password=Femis2021&_submit=",
        headers=headers)

    url = "https://www.bellefaye.com/fr/search"
    data = "name=%name%&firstName=%firstname%&searchCity=&searchZipCode=&searchEmail=&searchGender=&findPerson=&searchName=&searchCity2=&searchZipCode2=&searchEmail2="
    # URL-encode the values so accents/spaces survive the form encoding
    # (the body is sent as application/x-www-form-urlencoded).
    data = data.replace("%name%", parse.quote(lastname)) \
               .replace("%firstname%", parse.quote(firstname))
    page = wikipedia.BeautifulSoup(
        session.post(url, data=data, headers=headers).text, "html5lib")
    print(page.text)
def extract_profil_from_cnca(title):
    """Lookup against the CNC-RCA register (http://www.cnc-rca.fr/Pages/PageAccueil.aspx).

    Currently a stub: it fetches and parses the "RecOeuvre" search page but
    discards the result, then returns *title* unchanged.

    :param title: work title to look up (returned as-is for now)
    :return: the *title* argument, unmodified
    """
    # NOTE(review): `page` is never used — presumably a placeholder for a
    # future scrape of the search form; confirm before relying on this call.
    page = wikipedia.BeautifulSoup(
        wikipedia.requests.get(
            "http://www.cnc-rca.fr/Pages/Page.aspx?view=RecOeuvre",
            headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
    return title
def extract_actor_from_wikipedia(lastname, firstname):
    """Search French Wikipedia for a person and collect profile data.

    Scans the search results for a page whose title contains both names.
    On a match, returns a dict with:
      - "name":    "<firstname> <lastname>"
      - "photo":   last ``.jpg`` image found on the page (key absent if none)
      - "links":   page references whose domain is in a whitelist of film
                   databases, as {"title": label, "url": ref}
      - "summary", "title", "url": copied from the Wikipedia page

    :param lastname: family name expected to appear in the page title
    :param firstname: first name expected to appear in the page title
    :return: the dict above, or None when no search result title matches
    """
    wikipedia.set_lang("fr")
    searchs = wikipedia.search(lastname + " " + firstname)
    for search in searchs:
        page = wikipedia.page(search)
        # Bug fix: the original pre-seeded rc with list({...}) — list() over a
        # dict yields its *keys* — and that value was dead (either overwritten
        # on a match or never returned). Removed entirely.
        if lastname in page.title and firstname in page.title:
            rc = {"links": [], "name": firstname + " " + lastname}
            for img in page.images:
                if img.endswith(".jpg"):
                    rc["photo"] = img  # keeps the last .jpg, as before
            # Reference domains worth keeping, with their display labels
            # (matched against urlparse(ref).netloc, so entries must match
            # the exact host form, www. included where present).
            save_domains = [
                "unifrance.org", "www.lefilmfrancais", "www.allocine.fr",
                "catalogue.bnf.fr", "www.allmovie.com"
            ]
            libs = [
                "UniFrance", "Le Film Francais", "Allocine", "La BNF",
                "All movie"
            ]
            try:
                for ref in page.references:
                    domain = urlparse(ref).netloc
                    try:
                        idx = save_domains.index(domain)
                        rc["links"].append({"title": libs[idx], "url": ref})
                    except ValueError:
                        # Domain not in the whitelist — skip this reference.
                        pass
            except Exception:
                # Fetching references can fail; keep whatever was collected.
                pass
            rc["summary"] = page.summary
            rc["title"] = page.title
            rc["url"] = page.url
            return rc
    return None
def extract_profil_from_unifrance(name="céline sciamma", refresh_delay=31):
    """Scrape a person's profile from unifrance.org.

    Searches by *name*, opens the first person-directory match, checks the
    page really belongs to *name*, then gathers the photo and film links.

    :param name: full name to search for
    :param refresh_delay: cache-freshness value forwarded to ``load_page``
    :return: {"links": [{"url", "text", "nature"}...], "photo": url-or-"",
              "url": profile_url}, or None when no person page matches
    """
    page = load_page(
        "https://www.unifrance.org/recherche/personne?q=$query&sort=pertinence"
        .replace("$query", parse.quote(name)),
        refresh_delay=refresh_delay)
    links = page.findAll(
        'a',
        attrs={
            'href': wikipedia.re.compile(
                "^https://www.unifrance.org/annuaires/personne/")
        })
    rc = list()
    if len(links) > 0:
        u = links[0].get("href")
        page = wikipedia.BeautifulSoup(
            wikipedia.requests.get(u, headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
        # Accept the page if either the <title> or the search-result label
        # matches the requested name.
        if equal_str(name, page.title.text.split("-")[0]) or equal_str(
                name, links[0].text.split("Activités : ")[0]):
            photo = ""
            _photo = page.find('div',
                               attrs={'class': "profil-picture pull-right"})
            if _photo is not None:
                photo = _photo.find("a").get("href")
            links_film = page.findAll(
                'a',
                attrs={
                    'href': wikipedia.re.compile(
                        "^https://www.unifrance.org/film/[0-9][0-9]*/")
                })
            for l in links_film:
                # Bug fix: Tag.get("text") reads an HTML *attribute* named
                # "text", which these anchors don't have (always None);
                # l.text is the anchor's visible label.
                rc.append({
                    "url": l.get("href"),
                    "text": l.text,
                    "nature": ""
                })
            return {"links": rc, "photo": photo, "url": u}
    return None