Пример #1
0
def extract_movie_from_bdfci(pow: PieceOfWork, refresh_delay=31):
    """Search BDFCI for the title of *pow* and attach the matching link.

    :param pow: piece of work whose title is searched; saved when a link is found
    :param refresh_delay: cache refresh delay, forwarded to load_page
    :return: the '+'-joined title string used as the search query
    """
    title = pow.title.replace(" ", "+")
    search_url = ("https://www.bdfci.info/?q=" + title +
                  "&pa=f&d=f&page=search&src=bdfci&startFrom=1&offset=1")
    page = load_page(search_url, refresh_delay=refresh_delay)
    articles = page.find_all("article")
    url_ref = None
    if len(articles) == 0:
        # No result list: the search may have landed directly on the movie page.
        entete = page.find("h1")
        if entete is not None:
            text_entete = entete.text.split("<")[0].lower()
            if text_entete == pow.title.lower():
                # Fix: was `url_ref = page` (a BeautifulSoup object), which
                # crashed below on string concatenation; keep the URL instead.
                url_ref = search_url
    else:
        url = articles[0].find("a")
        if url is not None and url.attrs["title"].lower() == str(
                pow.title).lower():
            url_ref = "https://www.bdfci.info" + url.attrs["href"]

    if url_ref is not None:
        pow.add_link(url_ref, "BDFI")
        log("Ajout du lien BDFCI:" + url_ref + " pour " + pow.title)
        pow.dtLastSearch = datetime.now()
        pow.save()

    return title
Пример #2
0
def extract_film_from_senscritique(title: str, refresh_delay=31):
    """Look up *title* on senscritique.com and return the film page URL.

    :param title: film title to search (matched case-insensitively, exact text)
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: the film URL, or None when no exact match is found.
    """
    search_url = "https://www.senscritique.com/search?q=" + urlencode(title.lower())
    log("Recherche sur sens-critique : " + search_url)
    hits = load_page(search_url, save=False).find_all("div", {"data-qa": "hits"})
    if not hits:
        return None
    for anchor in hits[0].find_all("a"):
        href = anchor.attrs.get("href")
        if href is None or not href.startswith("https://www.senscritique.com/film/"):
            continue
        if anchor.getText().lower() != title.lower():
            continue
        log("Extraction de " + href)
        # Side effect kept from the original: the film page is fetched here
        # (presumably to populate load_page's cache); the result is unused.
        load_page(href, refresh_delay)
        return href
    return None
Пример #3
0
def extract_profil_from_imdb(lastname: str, firstname: str):
    """Search IMDb for a person and scrape their photo and filmography links.

    :param lastname: last name to match (case-insensitive containment)
    :param firstname: first name to match (case-insensitive containment)
    :return: dict possibly containing "photo", "url" and "links" — a list of
        {"url", "text", "job", "nature"} dicts, one per filmography entry.
    """
    peoples = ia.search_person(firstname + " " + lastname)
    infos = dict()
    for p in peoples:
        name = p.data["name"].upper()
        if firstname.upper() in name and lastname.upper() in name:
            # NOTE(review): assumes p.data always has a "headshot" key — confirm.
            if not "nopicture" in p.data["headshot"]:
                infos["photo"] = p.data["headshot"]
            if not "url" in infos:
                # Only the first matching person is scraped in depth.
                infos["url"] = "https://imdb.com/name/nm" + p.personID + "/"
                log("Ouverture de " + infos["url"])
                page = load_page(infos["url"])
                film_zone = page.find("div", {"id": "filmography"})
                if film_zone is None: film_zone = page

                links = film_zone.findAll(
                    'a', attrs={'href': wikipedia.re.compile("^/title/tt")})
                infos["links"] = []
                for l in links:
                    # Keep only anchors nested inside the filmography section
                    # whose text is long enough to look like a title.
                    if len(
                            l.getText()
                    ) > 3 and l.parent.parent.parent.parent and l.parent.parent.parent.parent[
                            "id"] == "filmography":
                        texts = l.parent.parent.text.split("(")
                        nature = "long"  # default nature when none is parsed
                        job: str = l.parent.parent.get("id").split("-")[0]
                        if job == "miscellaneous" or len(job) == 0:
                            # Fall back to the last parenthesized chunk as the job.
                            temp = l.parent.parent.text.split("(")
                            job = temp[len(temp) - 1].split(")")[0]
                            pass

                        url = "https://www.imdb.com" + l.get("href")
                        url = url.split("?")[0]

                        if len(texts) > 1:
                            # Match the parenthesized text against known natures.
                            nature = ""
                            for nat in MOVIE_NATURE:
                                if nat.lower() in texts[1].lower():
                                    nature = nat
                                    break
                            if nature == "":
                                log("Nature inconnue depuis " + texts[1] +
                                    " pour " + url)

                            if len(texts) > 2 and len(job) == 0:
                                job = texts[2].split(")")[0]

                        infos["links"].append({
                            "url": url,
                            "text": l.getText(),
                            "job": job,
                            "nature": nature
                        })

    return infos
Пример #4
0
def extract_profil_from_lefimlfrancais(firstname, lastname, refresh_delay=31):
    """Search lefilmfrancais.com for a person and collect their film links.

    :param firstname: first name used in the search query
    :param lastname: last name used in the search query
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: dict with "url" (profile page) and "links" (film link dicts);
        empty when the search returns at most one entry.
    """
    rc = dict()
    url = "http://www.lefilmfrancais.com/index.php?option=com_papyrus&view=recherche&task=json&tmpl=rss&term=" + firstname + "+" + lastname
    data = load_json(url)
    # NOTE(review): `> 1` skips single-entry responses — presumably the list's
    # last entry is not a real result; confirm against the endpoint's output.
    if len(data) > 1:
        rc["url"] = data[0]["link"]
        page = load_page(rc["url"], refresh_delay=refresh_delay)
        rc["links"] = []
        for l in page.find_all("a"):
            # Fix: anchors without an href attribute used to raise a KeyError.
            if l.attrs.get("href", "").startswith(
                    "http://www.lefilmfrancais.com/film/"):
                rc["links"].append({
                    "text": l.text,
                    "url": l.attrs["href"],
                    "source": "LeFilmFrancais"
                })
    return rc
Пример #5
0
def extract_profil_from_imdb(lastname: str, firstname: str, refresh_delay=31):
    """Search IMDb for a person and scrape photo and filmography links.

    Accent- and punctuation-insensitive variant of the profile scraper.

    :param lastname: last name to match (case/accent-insensitive containment)
    :param firstname: first name to match
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: dict possibly containing "photo", "url" and "links".
    """
    peoples = ia.search_person(
        remove_accents(firstname) + " " + remove_accents(lastname))
    infos = dict()
    for p in peoples:
        name = remove_accents(remove_ponctuation(p.data["name"].upper()))
        if firstname.upper() in name and lastname.upper() in name:
            # NOTE(review): assumes p.data always has a "headshot" key — confirm.
            if not "nopicture" in p.data["headshot"]:
                infos["photo"] = p.data["headshot"]
            if not "url" in infos:
                # Only the first matching person is scraped in depth.
                infos["url"] = "https://imdb.com/name/nm" + p.personID + "/"
                log("Ouverture de " + infos["url"])
                page = load_page(infos["url"], refresh_delay=refresh_delay)
                film_zone = page.find("div", {"id": "filmography"})
                if film_zone is None: film_zone = page

                # Holds every link pointing to a piece of work.
                infos["links"] = []
                links = film_zone.findAll(
                    'a', attrs={'href': wikipedia.re.compile("^/title/tt")})
                for l in links:
                    # Keep only anchors nested inside the filmography section
                    # whose text is long enough to look like a title.
                    if len(
                            l.getText()
                    ) > 3 and l.parent.parent.parent.parent and l.parent.parent.parent.parent[
                            "id"] == "filmography":
                        texts = l.parent.parent.text.split("(")
                        nature = "long"
                        job: str = l.parent.parent.get("id").split("-")[0]
                        if job == "miscellaneous" or len(job) == 0:
                            # Fall back to the last parenthesized chunk as the job.
                            temp = l.parent.parent.text.split("(")
                            job = temp[len(temp) - 1].split(")")[0]
                            pass
                        else:
                            if not in_dict(job, "jobs"): job = ""

                        url = "https://www.imdb.com" + l.get("href")
                        url = url.split("?")[0]

                        # NOTE(review): `job`/`nature` are computed above but
                        # not stored here — confirm the empty strings are
                        # intentional.
                        infos["links"].append({
                            "url": url,
                            "text": l.getText(),
                            "job": "",
                            "nature": ""
                        })
    return infos
Пример #6
0
def extract_profil_from_unifrance(name="céline sciamma", refresh_delay=31):
    """Search unifrance.org for a person; scrape profile photo and film links.

    :param name: full name to search for
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: {"links": [...], "photo": str, "url": str} when the first search
        hit matches *name*, otherwise None.
    """
    page = load_page(
        "https://www.unifrance.org/recherche/personne?q=$query&sort=pertinence"
        .replace("$query", parse.quote(name)),
        refresh_delay=refresh_delay)
    links = page.findAll(
        'a',
        attrs={
            'href':
            wikipedia.re.compile(
                "^https://www.unifrance.org/annuaires/personne/")
        })

    rc = list()
    if len(links) > 0:
        # Fetch the first hit's profile page directly.
        u = links[0].get("href")
        page = wikipedia.BeautifulSoup(
            wikipedia.requests.get(u, headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
        if equal_str(name,
                     page.title.text.split("-")[0]) or equal_str(
                         name, links[0].text.split("Activités : ")[0]):
            photo = ""
            _photo = page.find('div',
                               attrs={'class': "profil-picture pull-right"})
            if _photo is not None: photo = _photo.find("a").get("href")

            links_film = page.findAll(
                'a',
                attrs={
                    'href':
                    wikipedia.re.compile(
                        "^https://www.unifrance.org/film/[0-9][0-9]*/")
                })
            for l in links_film:
                rc.append({
                    "url": l.get("href"),
                    # Fix: l.get("text") looked up a (non-existent) HTML
                    # attribute named "text" and always yielded None; use the
                    # anchor's text content instead.
                    "text": l.getText(),
                    "nature": ""
                })

            return {"links": rc, "photo": photo, "url": u}

    return None
Пример #7
0
def extract_film_from_unifrance(url: str, job_for=None):
    """Scrape a unifrance.org film page into a dict.

    :param url: film page URL, or a plain title resolved via the search page
    :param job_for: person profile URL; when given, rc["job"] is set to that
        person's role on the film (director or casting-section job)
    :return: dict with title/visual/real/nature/year/category/job/synopsis
        keys as found; empty dict when a title search yields no film link.
    """
    rc = dict()
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url))
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None: return rc

        url = _link.get("href")

    page = load_page(url)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    # The poster lives in a section whose h1 mentions "Affiches".
    for title in page.findAll('h1'):
        if "Affiches" in title.text:
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if _img is not None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    if _real is not None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")

    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            # The first detail block without a colon holds the work's nature.
            if not ":" in div.text: rc["nature"] = div.text

        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1

    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    if job_for is not None:
        # Fix: rc["real"] raised a KeyError when no director div was found;
        # .get() returns None instead, which simply fails the comparison.
        if rc.get("real") == job_for:
            rc["job"] = "Réalisation"
        else:
            section = page.find("section", {"id": "casting"})

            if section is not None:
                # One h2 job heading per paragraph of person links.
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        if "/personne" in l.get("href"):
                            if l.get("href") == job_for:
                                rc["job"] = jobs[idx].text.replace(" : ", "")
                                break

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None: rc["synopsis"] = _synopsis.getText(strip=True)

    return rc
Пример #8
0
def extract_film_from_imdb(
    url: str,
    title: str,
    name="",
    job="",
):
    """Scrape an IMDb film page (legacy layout) into a dict.

    :param url: IMDb title page URL ("fullcredits" is appended for the crew)
    :param title: working title of the film, used as rc["title"]
    :param name: person whose credit/job is looked up on the credits page
    :param job: fallback job when none is found on the credits page
    :return: dict with title/nature/category/visual/year/synopsis/job keys.
    """
    page = load_page(url)

    rc = dict({"title": title, "nature": translate("film")})

    zone_info = page.find("div", {"class": "title_block"})
    # Robustness fix: the title block may be absent; fall back to the whole
    # page instead of crashing on getText() below.
    if zone_info is None: zone_info = page
    if title.startswith("Episode") or "Episode" in zone_info.getText():
        section_title = page.find("div", {"class": "titleParent"})
        if section_title is not None:
            title = section_title.find("a").text + " " + title
        # Identify the episode (SxEy) from the navigation panel.
        rc["nature"] = MOVIE_NATURE[0]
        zone_info_comp = page.find("div",
                                   {"class": "button_panel navigation_panel"})
        if zone_info_comp is not None and "Season" in zone_info_comp.getText():
            extract_text = "S" + zone_info_comp.getText().split(
                "Season")[1].replace("Episode ", "E").replace(
                    " | ", "").replace(" ", "")
            rc["title"] = title + " " + extract_text.split("\n")[0]

    for cat in MOVIE_CATEGORIES:
        if cat.lower() in zone_info.getText().lower():
            rc["category"] = cat
    if not "category" in rc:
        rc["category"] = "Inconnue"
        log("Pas de categorie pour " + url)

    affiche = page.find("div", attrs={"class": "poster"})
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    # Fix: narrowed the bare excepts — re.search(...).group() raises
    # AttributeError when the pattern does not match (search returns None),
    # as does page.title.text when the page has no <title>.
    try:
        rc["year"] = re.search('[1-2][0-9][0-9][0-9]',
                               page.title.text).group(0)
    except AttributeError:
        try:
            rc["year"] = re.search('[1-2][0-9][0-9][0-9]',
                                   zone_info.getText()).group(0)
        except AttributeError:
            pass

    summary_section = page.find("div", attrs={"class": "summary_text"})
    if summary_section is not None and not "Add a Plot" in summary_section.text:
        rc["synopsis"] = summary_section.text.replace("\n", "").strip()

    log("Recherche du role sur le film")

    credits = load_page(url + "fullcredits")
    if credits is not None:
        credits = credits.find("div", {"id": "main"})
        if credits is not None:
            links = credits.find_all("a")
            for l in links:
                if name.upper() in l.text.upper():
                    # The job sits in the "credit" cell of the person's row.
                    parent = l.parent.parent.find("td", {"class": "credit"})
                    if parent is not None:
                        rc["job"] = str(parent.getText().replace("\n",
                                                                 "")).strip()
                        rc["job"] = rc["job"].split("(")[0]
                        while "  " in rc["job"]:
                            rc["job"] = rc["job"].replace("  ", " ")

                    break

    if not "job" in rc: rc["job"] = job

    return rc
Пример #9
0
def extract_film_from_imdb(url: str,
                           title: str,
                           name="",
                           job="",
                           all_casting=False,
                           refresh_delay=31):
    """Scrape an IMDb film page (new, data-testid based layout) into a dict.

    :param url: IMDb title URL, or a plain title resolved via the find page
    :param title: working title (punctuation removed before use)
    :param name: person whose job is looked up on the fullcredits page
    :param job: fallback job when none is found
    :param all_casting: when True, collect every credited person in "casting"
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: dict with title/nature/casting/url/source/category/year/visual/
        synopsis/job keys, or None when the film cannot be resolved.
    """
    if not url.startswith("http"):
        page = load_page("https://www.imdb.com/find?s=tt&q=" +
                         parse.quote(url))
        bFind = False
        for link in page.find_all("a"):
            if link and equal_str(
                    link.text, url) and link["href"].startswith("/title/tt"):
                url = "https://www.imdb.com" + link["href"]
                bFind = True
                break
        if not bFind:
            log(url + " introuvable sur IMDB")
            return None

    page = load_page(url, refresh_delay)

    title = remove_ponctuation(title)

    rc = dict({
        "title": title,
        "nature": "",
        "casting": list(),
        "url": url,
        "source": "auto:IMDB"
    })

    # Index every element carrying a data-testid and try to infer the work's
    # nature from free-text elements (runtime strings like "1h30m").
    divs = dict()
    elts = page.find_all("div", recursive=True) + page.find_all(
        "h1", recursive=True) + page.find_all(
            "ul", recursive=True) + page.find_all("p") + page.find_all("li")
    for div in elts:
        s = div.text
        s_t = translate(s)
        if s_t in MOVIE_NATURE:
            rc["nature"] = s_t
        # Fix: parenthesized the `or` — `and` bound tighter, so the
        # endswith("m")/empty-nature tests never applied to the "1h" branch.
        if (s.startswith("1h") or s.startswith("2h")) and s.endswith(
                "m") and len(rc["nature"]) == 0:
            rc["nature"] = translate("long")
        if "data-testid" in div.attrs:
            divs[div.attrs["data-testid"]] = div

    # Nature / category lookup.
    if not "genres" in divs:
        elt = page.find("li", {
            "role": "presentation",
            "class": "ipc-inline-list__item"
        })
        if elt is not None:
            cat = elt.text
        else:
            cat = "inconnu"
    else:
        cat = ""
        for div in divs["genres"]:
            cat = cat + translate(div.text.lower()) + " "
        if cat.split(" ")[0] in MOVIE_NATURE:
            rc["nature"] = cat.split(" ")[0]
            cat = cat.replace(rc["nature"], "").strip()

    rc["category"] = cat.strip()

    # Fix: narrowed the bare except — KeyError when a data-testid block is
    # missing, AttributeError when the year regex does not match.
    try:
        title = divs["hero-title-block__title"].text
        year = divs["hero-title-block__metadata"].text
        if year is not None: rc["year"] = re.search(r"(\d{4})", year).group(1)
    except (KeyError, AttributeError):
        log("Erreur sur title=" + title)
        return None

    # Fix: use .get() — a direct [] lookup raised KeyError when the poster
    # block was absent, making the None check below unreachable.
    affiche = divs.get("hero-media__poster")
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    rc["synopsis"] = ""
    if "plot" in divs:
        rc["synopsis"] = divs["plot"].text.replace("Read all", "")

    credits = load_page(url + "fullcredits", refresh_delay)
    if credits is not None:
        credits = credits.find("div", {"id": "fullcredits_content"})
        if credits is not None:
            # One h4 heading per credit table; scan each row for the person.
            sur_jobs = credits.find_all("h4")
            tables = credits.find_all("table")
            for i in range(0, len(tables)):
                trs = tables[i].find_all("tr")

                for tr in trs:
                    tds = tr.find_all("td")
                    if len(tds) > 1:
                        findname = tds[0].text.replace("\n", "").replace(
                            "  ", " ").strip()
                        if len(findname) == 0:
                            findname = tds[1].text.replace("\n", "").replace(
                                "  ", " ").strip()
                        if len(findname) > 0:
                            if equal_str(findname, name):
                                sur_job = sur_jobs[i].text.replace(
                                    "\n", " ").strip()
                                if "Cast" in sur_job or "Serie Cast" in sur_job:
                                    if len(tds) > 3 and "Self" in tds[3].text:
                                        job = ""
                                    else:
                                        job = "Actor"
                                else:
                                    # Last cell holds the job; strip variants.
                                    job = tds[len(tds) - 1].text.split(
                                        "(")[0].split("/")[0].strip()
                                    if len(job) == 0 and len(
                                            sur_jobs[i].text) > 0:
                                        job = sur_job.replace(" by",
                                                              "").strip()

                                job = job.split("\n")[0]
                                rc["job"] = translate(job)
                                if len(job) == 0:
                                    log("Job non identifié pour " + name +
                                        " sur " + url)
                                else:
                                    if not all_casting: break
                            else:
                                if all_casting:
                                    # Fix: tds[0] is a Tag (no .split method);
                                    # split the cell's text instead.
                                    names = tds[0].text.split(" ")
                                    rc["casting"].append({
                                        "name": " ".join(names),
                                        "source": "imdb",
                                        "job": job
                                    })

    if not "job" in rc: rc["job"] = job

    return rc
Пример #10
0
def extract_awards_from_imdb(profil_url, profil):
    """Scrape the awards page of an IMDb profile and persist Award records.

    For each award row whose film matches an existing PieceOfWork (by title
    and year), ensures a Festival with that name exists and creates the
    Award when it is not already recorded for this profile.

    :param profil_url: base IMDb profile URL ("awards?ref_=nm_awd" appended)
    :param profil: Profil model instance the awards are attached to
    """
    # Fetch the awards page.
    page = load_page(profil_url + "awards?ref_=nm_awd")

    # One h3 heading per festival; the first h3 is the page title itself.
    awards = page.find_all("h3")
    if len(awards) > 0:
        awards.pop(0)

    tables = page.find_all("table", {"class": "awards"})

    # NOTE(review): assumes awards[i] lines up with tables[i] — confirm the
    # page always pairs exactly one heading with one table.
    for i in range(0, len(tables)):
        for tr in tables[i].find_all("tr"):
            if tr:
                festival_title = translate(
                    awards[i].text.split(",")[0].lower().strip())
                tds = tr.find_all("td")
                if len(tds) <= 2:
                    log("Format non conforme " + tr.text)
                else:
                    # Row layout: year | award text | film link "(year)".
                    year = tds[0].text.replace("\n", "").replace(" ",
                                                                 "").strip()
                    award = tds[1].text

                    film = tds[2].find("a")
                    if film and award:
                        win = ("Winner" in award)
                        film_title = film.text
                        if "(" in tds[2].text:
                            film_year = tds[2].text.split("(")[1].split(")")[0]
                            # Only awards matching a known work are stored.
                            pow = PieceOfWork.objects.filter(
                                title__iexact=film_title,
                                year__iexact=film_year)
                            if pow.exists():
                                pow = pow.first()
                                f = Festival.objects.filter(
                                    title__iexact=festival_title)
                                if f.exists():
                                    f = f.first()
                                else:
                                    f = Festival(title=festival_title)
                                    f.save()

                                a = Award.objects.filter(pow__id=pow.id,
                                                         year=year,
                                                         festival__id=f.id,
                                                         profil__id=profil.id)
                                if a.exists():
                                    a = a.first()
                                else:
                                    # Strip Winner/Nominee markers and any
                                    # leading parenthesized qualifier.
                                    award = award.replace("\n", "").replace(
                                        "Winner", "").replace("Nominee",
                                                              "").strip()
                                    if award.startswith("(") and ")" in award:
                                        award = award.split(")")[1]
                                    a = Award(description=award,
                                              year=year,
                                              pow=pow,
                                              festival=f,
                                              profil=profil,
                                              winner=win)
                                try:
                                    a.save()
                                except:
                                    log("!!Probleme d'enregistrement de l'award sur "
                                        + pow.title)
Пример #11
0
def extract_film_from_unifrance(url: str,
                                job_for=None,
                                all_casting=False,
                                refresh_delay=30):
    """Scrape a unifrance.org film page into a dict (extended variant).

    :param url: film page URL, or a plain title resolved via the search page
    :param job_for: person (profile URL or name) whose job on the film is
        looked up; also used as default awardee for prizes without a profile
    :param all_casting: when True, every credited person is appended to
        rc["casting"]
    :param refresh_delay: cache refresh delay forwarded to load_page
    :return: dict of scraped fields (title/visual/real/nature/visa/langue/
        year/category/prix/casting/job/synopsis), or None when not found.
    """
    rc = dict({"casting": [], "source": "auto:unifrance", "url": url})
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url),
                         refresh_delay=refresh_delay)
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None: return None

        url = _link.get("href")
        rc["url"] = url

    page = load_page(url, refresh_delay)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if not _title is None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    # The poster lives in the section whose h1 starts with "Affiches".
    for title in page.findAll('h1'):
        if title.text.startswith("Affiches"):
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if not _img is None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    if not _real is None and not _real.find("a", attrs={"itemprop": "name"
                                                        }) is None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")

    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            # The first detail block without a colon holds the work's nature.
            if not ":" in div.text: rc["nature"] = div.text

        if "Numéro de visa" in div.text:
            rc["visa"] = div.text.split(" : ")[1].replace(".", "")

        if "Langues de tournage" in div.text:
            rc["langue"] = div.text.split(" : ")[1]

        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1

    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    # Awards ("palmares") sections: description, festival title/year, awardee.
    rc["prix"] = []
    for section_prix in page.find_all("div",
                                      attrs={"class": "distinction palmares"}):
        if len(section_prix.find_all("div")) > 0:
            content = section_prix.find_all("div")[1].text
            if content is not None:
                content = content.replace("PlusMoins", "")
                _prix = {
                    "description": content.split(")Prix")[1].split(" : ")[0]
                }

                for l in section_prix.find_all("div")[1].find_all("a"):
                    if "festivals" in l.attrs["href"]:
                        _prix["title"] = l.text.split("(")[0]
                        _prix["year"] = re.findall(r"[1-2][0-9]{3}", l.text)[0]
                    if "person" in l.attrs["href"] and "profil" not in _prix:
                        _prix["profil"] = index_string(l.text)

                # NOTE(review): crashes if job_for is None and the prize has
                # no person link — confirm job_for is always set here.
                if not "profil" in _prix:
                    log("Attribution du prix à " + job_for)
                    _prix["profil"] = index_string(job_for)

                if "year" in _prix and "title" in _prix:
                    rc["prix"].append(_prix)
                    log("Ajout du prix " + str(_prix))
                else:
                    log("!Prix non conforme sur " + url)

    if not job_for is None:
        # NOTE(review): assumes the description div exists — .find("p") on
        # None would raise otherwise; confirm against the page layout.
        real_links = page.find("div", {
            "id": "description"
        }).find("p").find_all("a")
        if len(real_links) > 0 and equal_str(real_links[0].text, job_for):
            rc["job"] = translate("Réalisation")
        else:
            # Look for the person among the directors.
            section = page.find("div", {"itemprop": "director"})
            if section and (job_for.lower() in section.text.lower()):
                rc["job"] = translate("Réalisation")

            # Search the detailed credits: one h2 job heading per paragraph.
            section = page.find("section", {"id": "casting"})
            if not section is None:
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        job = jobs[idx].text.replace(":", "").strip()
                        if "/personne" in l.get("href"):
                            if (job_for.startswith("http")
                                    and l.get("href") == job_for) or equal_str(
                                        job_for, l.text):
                                rc["job"] = job
                                break
                            else:
                                if all_casting:
                                    # Add the whole casting to the system.
                                    names = str(l.getText()).split(" ")
                                    lastname = names[len(names) - 1]
                                    rc["casting"].append({
                                        "lastname":
                                        lastname,
                                        "url":
                                        l.attrs["href"],
                                        "source":
                                        "unifrance",
                                        "firstname":
                                        l.getText().replace(lastname,
                                                            "").strip(),
                                        "job":
                                        job
                                    })

            # Search among the actors.
            for actor in page.find_all("div", {"itemprop": "actors"}):
                if "data-title" in actor.attrs:
                    if actor.attrs["data-title"].lower() == job_for.lower():
                        rc["job"] = "actor"

    if not "job" in rc:
        pass

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if not _synopsis is None:
        rc["synopsis"] = _synopsis.getText(strip=True)

    return rc
Пример #12
0
def extract_film_from_leFilmFrancais(url: str,
                                     job_for=None,
                                     all_casting=False,
                                     refresh_delay=30,
                                     bot=None):
    """Scrape a lefilmfrancais.com film page into a dict.

    :param url: film page URL, or a plain title resolved via the search page
    :param job_for: unused in this body; kept for signature parity with the
        other extract_film_* functions
    :param all_casting: unused; kept for signature parity
    :param refresh_delay: unused; kept for signature parity
    :param bot: optional browser/bot instance forwarded to load_page
    :return: dict with nature/title/source/url plus any detail fields found,
        or None when a title search yields no film link.
    """
    rc = dict({
        "nature": "",
        "title": "",
        "source": "auto:LeFilmFrancais",
        "url": url
    })
    if not url.startswith("http"):
        page = load_page(
            "http://www.lefilmfrancais.com/index.php?option=com_papyrus&view=recherche&searchword="
            + parse.quote(url))
        bFind = False
        fiche_film = page.find("div", {"id": "fiche_film"})
        if fiche_film:
            for l in fiche_film.find_all("a"):
                # Fix: anchors without an href attribute raised a KeyError.
                if l and l.attrs.get("href", "").startswith(
                        "http://www.lefilmfrancais.com/film/"):
                    url = l["href"]
                    bFind = True
                    break
        if not bFind: return None

    page = load_page(url, bot=bot)
    if page.find("div", {"id": "synopsis"}):
        rc["synopsis"] = remove_html(page.find("div", {"id": "synopsis"}).text)

    elts = page.find_all("h1")
    if len(elts) > 0:
        rc["title"] = elts[0].text.split("(")[0]

    elt = page.find("div", {"id": "detail"})
    if elt:
        for item in elt:
            if item.name is None:
                # NOTE(review): release-date ("sortie") parsing was never
                # implemented — this branch is a no-op in the original too.
                if "sortie" in item.lower():
                    pass

    # Span fields: "realisation"-classed spans carry the nature; other spans
    # carry "label : value" pairs.
    for span in page.find_all("span"):
        if "class" in span.attrs and len(
                span.attrs["class"]
        ) > 0 and span.attrs["class"][0] == "realisation":
            if not "Réalisation" in span.text.split(",")[0]:
                rc["nature"] = span.text.split(",")[0].split("(")[0]
        else:
            if ":" in span.text:
                val = span.text.split(":")[1].strip()
                if "Visa" in span.text: rc["visa"] = val
                if "Titre original" in span.text: rc["original_title"] = val
                if "Réalisation" in span.text: rc["real"] = val
                if "Sortie" in span.text: rc["sortie"] = val
                if "copies" in span.text: rc["copies"] = int(val)
                if "Nationalité" in span.text: rc["Nationality"] = val
                if "Distribution France" in span.text: rc["distribution"] = val

    # List items: "label : value | ..." pairs.
    for item in page.find_all("li"):
        lab = item.text.split(":")[0]
        if ":" in item.text:
            val = item.text.split(":")[1].split("|")[0].strip()
            if "production :" in lab: rc["production"] = val
            if "Partenaires" in lab: rc["financial"] = val
            if "Récompense" in lab: rc["prix"] = val
            if "Presse" in lab: rc["presse"] = val

    if "title" in rc: log("Extraction de " + rc["title"] + " : " + str(rc))
    return rc