Пример #1
0
 def __init__(self):
     """Create the downloader, logger and status connector, and load limits.

     The logger name and both request limits are read from
     ``DownloaderConfig``. ``failed_requests`` starts at 0.
     """
     # Collaborators are built in the original order in case their
     # constructors have side effects.
     self.downloader = Downloader()
     self.failed_requests = 0

     log = initLogger.getLogger(
         DownloaderConfig.IMDB_PERSON_DOWNLOADER_LOGGER_NAME)
     self.logger = log
     self.connector = IMDBPersonStatusConnector()

     config = DownloaderConfig
     self.onepage_limit = config.IMDB_PERSON_DOWNLOADER_ONEPAGE_REQUESTS_LIMIT
     self.global_limit = config.IMDB_PERSON_DOWNLOADER_GLOBAL_REQUESTS_LIMIT

     log.debug("IMDB Person Downloader Created")
Пример #2
0
# /usr/bin/env python
# -*- coding: latin-1 -*-

####################################################################


#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_config as FilmExtractorConfig

from IMDBExtractor.IMDBExtractor import *
from cinema.models import *

logger = initLogger.getLogger(FilmExtractorConfig.IMDB_FILM_EXTRACTOR_LOGGER_NAME)

###################################################################

################################################################
#
#                        IMDB_*Extract Family
#
##################################################################

""" Les fonctions de ce module créent les objets nécessaires à l'extraction et remplissent la DB en appelant les fonction du module FilmExtractor_utils. Il existe une fonction par type de page"""


   

def IMDB_filmExtract(film_id):
Пример #3
0
########################

#importe les modules internes
import Logger.init_logger as initLogger  #Initialise le logger
from Extractor.superExtractor import SuperExtractor  #Charge la super classe

import Logger.logger_config as loggerConfig
import Extractor.extractor_config as extractorConfig

# Import modules external to the application
from lxml import etree
from lxml.html.clean import Cleaner
import StringIO

logger = initLogger.getLogger(extractorConfig.EXTRACTOR_HTML_LOGGER_NAME)
#########################
""" La classe ExtractorHTML herite de la classe SuperExtractor. Elle permet de définir les fonctions nécessaire à l'extraction de données dans un document HTML, dont la chaine de caractère est donnée en paramètre de l'objet"""


class ExtractorHTML(SuperExtractor):
    def __init__(self, htmlString, cleaner):
        SuperExtractor.__init__(self, htmlString)
        self.htmlString = self.string.replace("\n", "").replace("\r", "")

        self.parser = etree.HTMLParser()
        self.cleaner = cleaner
        self.cleanString = self.cleaner.clean_html(self.htmlString)

        #Définit l'arbre sur lequel se feront toutes les XPath extraction
        self.tree = etree.parse(StringIO.StringIO(self.cleanString),
Пример #4
0
from Extractor.extractorHTML import ExtractorHTML

import Extractor.customisedCleaner as CustomCleaner
from cinema.models import *

from Connector.IMDBStatusConnector import *

from FilmExtractor_utils.define_entities import *

import re
import urllib

import random
import codecs

logger = initLogger.getLogger(IMDBExtractorConfig.IMDB_EXTRACTOR_LOGGER_NAME)

###################################################################

import md5

#################################################################


class IMDBExtractor:

   """
      Chaque page nécessite un extractor qui lui est propre : 
          Film 
          Personne (la structure de Actor/Writer/Director est identique) 
          Company 
Пример #5
0


####################################################################


#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_utils_config as FilmExtractorUtilsConfig

from cinema.models import *


logger = initLogger.getLogger(FilmExtractorUtilsConfig.UTILS_EXTRACTOR_LOGGER_NAME)

###################################################################

##########################################################
#
#                    DEFINE FAMILY
#
##########################################################

"""Crée/Renvoie les objects pour intéragir avec la base de données Django."""

def defineFilm(film_id):
    try :
      f = Film.objects.get(imdb_id=film_id)
      return f
Пример #6
0
from Extractor.extractorHTML import ExtractorHTML
import Extractor.customisedCleaner as CustomCleaner

from Connector.IMDBStatusConnector import IMDBFilmStatusConnector

import spider_config as SpiderConfig

import urllib
from urllib import FancyURLopener

import re

import random

# Logger for this module
logger = initLogger.getLogger(SpiderConfig.IMDB_SPIDER_LOGGER_NAME)


####################################################################

# Custom User-Agent to load IMDB search results

class IMDBSpiderURLopener(FancyURLopener):
    """FancyURLopener subclass whose only change is a browser-like ``version``.

    ``version`` is the string the opener presents as its User-Agent.
    """
    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"

# Rebind urllib's module-level opener, so this applies process-wide.
urllib._urlopener = IMDBSpiderURLopener()

####################################################################

def searchURL(year, start_pos):
    logger.debug("Compute the url for IMDB search:")
Пример #7
0
                    '--imdb-awards',
                    dest='imdb_awards',
                    help="Si présent, lance l'extraction d'awards des films",
                    action='store_true')
# Flag: when present, run the extraction of the films' actors.
parser.add_argument(
    '-actor',
    '--imdb-actors',
    dest='imdb_actors',
    help="Si présent, lance l'extraction des acteurs des films",
    action='store_true')
# Store the parsed command-line arguments on initConfig so they are
# reachable globally.
initConfig.args = parser.parse_args()

##########
# Create the logger for this script
logger = initLogger.getLogger(initConfig.SCRAPER_INIT_LOGGER_NAME)
logger.debug('Logger {} créé'.format(initConfig.SCRAPER_INIT_LOGGER_NAME))

#################
# Empty the log file when requested
debug_file = initConfig.RUN_TIME_FOLDER + loggerConfig.LOG_FILE
if initConfig.args.fresh_debug:
    logger.info('Vide le fichier {}...'.format(debug_file))
    # Opening in 'w' mode truncates the file; it is closed right away.
    open(debug_file, 'w').close()

if initConfig.args.imdb_spider:
    logger.info('Lancement du Spider')
    # Imported only when the flag is set.
    # NOTE(review): presumably the spider runs as an import side effect - confirm.
    import Spider.IMDBSpider

if initConfig.args.imdb_priority_spider:
    logger.info('Lancement du Spider de Priorités')
Пример #8
0
import Logger.init_logger as initLogger  #Initialise le logger
import Logger.logger_config as loggerConfig

import IMDBExtractor_config as IMDBExtractorConfig

import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor

from status.models import *
from cinema.models import *

import threading
import time
import random

logger = initLogger.getLogger(
    IMDBExtractorConfig.EXTRACTOR_PERSON_PIC_LOGGER_NAME)

###################################################################

# Arguments passed to getExtractedFiltered() below.
# NOTE(review): presumably a release-year range and a priority ceiling - confirm
# against IMDBFilmStatusConnector.
year_min = 1980
year_max = 1989
priority_max = 1000

# Ask the film status connector for the ids matching the filter above.
film_conn = Connector.IMDBStatusConnector.IMDBFilmStatusConnector()
film_id_tab = film_conn.getExtractedFiltered(year_min, year_max, priority_max)
for film_id in film_id_tab:
    logger.debug('Film en cours d extraction : {}'.format(film_id))
    Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(
        film_id, "0")
    FilmExtractor.IMDB_Extractor.IMDB_actorsDirectorsExtract(film_id)
    Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(
#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_config as FilmExtractorConfig

import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor

from status.models import *
from cinema.models import *

import threading
import time
logger = initLogger.getLogger(FilmExtractorConfig.EXTRACTOR_IMDB_INIT_LOGGER_NAME)

###################################################################

# Module-level selection bounds.
# NOTE(review): not referenced in the visible part of this script; presumably a
# year range and a priority ceiling used further down - confirm.
year_min=2000
year_max=2012
priority_max=1000

def extractOneMovie(imdb_id):
   """Call ``IMDB_SuperExtractor`` for ``imdb_id``; its result is discarded."""
   run_extraction = FilmExtractor.IMDB_Extractor.IMDB_SuperExtractor
   run_extraction(imdb_id)

def extractOneMovieAwards(imdb_id):
   """Call ``IMDB_awardsExtract`` for ``imdb_id``; its result is discarded."""
   run_awards_extraction = FilmExtractor.IMDB_Extractor.IMDB_awardsExtract
   run_awards_extraction(imdb_id)

def setUnextractedToOneMovie(imdb_id):
   """Set the extracted status of ``imdb_id`` to ``"0"`` via a new film status connector."""
   status_connector = Connector.IMDBStatusConnector.IMDBFilmStatusConnector()
   status_connector.setExtractedStatus(imdb_id, "0")
Пример #10
0
# /usr/bin/env python
# -*- coding: latin-1 -*-

####################################################################

#importe les modules internes
import Logger.init_logger as initLogger  #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_utils_config as FilmExtractorUtilsConfig

from cinema.models import *

logger = initLogger.getLogger(
    FilmExtractorUtilsConfig.UTILS_EXTRACTOR_LOGGER_NAME)

###################################################################

##########################################################
#
#                    DEFINE FAMILY
#
##########################################################
"""Crée/Renvoie les objects pour intéragir avec la base de données Django."""


def defineFilm(film_id):
    try:
        f = Film.objects.get(imdb_id=film_id)
        return f
    except Film.DoesNotExist:
Пример #11
0
from Extractor.extractorHTML import ExtractorHTML

import Extractor.customisedCleaner as CustomCleaner
from cinema.models import *

from Connector.IMDBStatusConnector import *

from FilmExtractor_utils.define_entities import *

import re
import urllib

import random
import codecs

logger = initLogger.getLogger(IMDBExtractorConfig.IMDB_EXTRACTOR_LOGGER_NAME)

###################################################################

import md5

#################################################################


class IMDBExtractor:
    """
      Chaque page nécessite un extractor qui lui est propre : 
          Film 
          Personne (la structure de Actor/Writer/Director est identique) 
          Company 
          Keyword 
Пример #12
0
 def __init__(self):
     """Attach the downloader logger and install the spider URL opener.

     Assigning ``urllib._urlopener`` rebinds a module-level attribute, so
     it is not limited to this instance.
     """
     logger_name = DownloaderConfig.DOWNLOADER_LOGGER_NAME
     self.logger = initLogger.getLogger(logger_name)

     opener = IMDBSpiderURLopener()
     urllib._urlopener = opener
Пример #13
0
# /usr/bin/env python
# -*- coding: latin-1 -*-

####################################################################

#importe les modules internes
import Logger.init_logger as initLogger  #Initialise le logger
import Logger.logger_config as loggerConfig

import UserAgent.userAgent_config as userAgentConfig
from urllib import FancyURLopener
import urllib

logger = initLogger.getLogger(userAgentConfig.USER_AGENT_LOGGER_NAME)

###################################################################

####################################################################

# Custom User-Agent to load IMDB search results


class CustomURLopener(FancyURLopener):
    """FancyURLopener subclass whose only change is a browser-like ``version``.

    ``version`` is the string the opener presents as its User-Agent.
    """
    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"


# Runs at import time: log the change, then rebind urllib's module-level
# opener so it applies process-wide.
logger.debug('Modification du User Agent')
urllib._urlopener = CustomURLopener()

####################################################################
 def __init__(self):
     """Bind ``self.logger`` to the company status connector logger."""
     logger_name = ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME
     self.logger = initLogger.getLogger(logger_name)
Пример #15
0
# /usr/bin/env python
# -*- coding: latin-1 -*-

####################################################################

#importe les modules internes
import Logger.init_logger as initLogger  #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_config as FilmExtractorConfig

from IMDBExtractor.IMDBExtractor import *
from cinema.models import *

logger = initLogger.getLogger(
    FilmExtractorConfig.IMDB_FILM_EXTRACTOR_LOGGER_NAME)

###################################################################

################################################################
#
#                        IMDB_*Extract Family
#
##################################################################
""" Les fonctions de ce module créent les objets nécessaires à l'extraction et remplissent la DB en appelant les fonction du module FilmExtractor_utils. Il existe une fonction par type de page"""


def IMDB_filmExtract(film_id):
    logger.debug(
        "Lancement de l'extraction de la Page film pour le film {}".format(
            film_id))
Пример #16
0
from Extractor.extractorHTML import ExtractorHTML
import Extractor.customisedCleaner as CustomCleaner

from Connector.IMDBStatusConnector import IMDBFilmStatusConnector

import spider_config as SpiderConfig

import urllib
from urllib import FancyURLopener

import re

import random

# Logger for this module
logger = initLogger.getLogger(SpiderConfig.IMDB_SPIDER_LOGGER_NAME)

####################################################################

# Custom User-Agent to load IMDB search results


class IMDBSpiderURLopener(FancyURLopener):
    """FancyURLopener subclass whose only change is a browser-like ``version``.

    ``version`` is the string the opener presents as its User-Agent.
    """
    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"


# Rebind urllib's module-level opener, so this applies process-wide.
urllib._urlopener = IMDBSpiderURLopener()

####################################################################

Пример #17
0
# -*- coding: latin-1 -*-



####################################################################


#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
import Logger.logger_config as loggerConfig

import UserAgent.userAgent_config as userAgentConfig
from urllib import FancyURLopener
import urllib

logger = initLogger.getLogger(userAgentConfig.USER_AGENT_LOGGER_NAME)

###################################################################


####################################################################

# Custom User-Agent to load IMDB search results

class CustomURLopener(FancyURLopener):
    """FancyURLopener subclass whose only change is a browser-like ``version``.

    ``version`` is the string the opener presents as its User-Agent.
    """
    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"

# Runs at import time: log the change, then rebind urllib's module-level
# opener so it applies process-wide.
logger.debug('Modification du User Agent')
urllib._urlopener = CustomURLopener()

####################################################################
 def __init__(self):
     """Bind ``self.logger`` to the company status connector logger."""
     get_logger = initLogger.getLogger
     self.logger = get_logger(
         ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME)
Пример #19
0
 def __init__(self):
     """Set up the instance logger and swap in the spider URL opener.

     The opener is stored on the ``urllib`` module itself
     (``urllib._urlopener``), not on this instance.
     """
     get_logger = initLogger.getLogger
     self.logger = get_logger(DownloaderConfig.DOWNLOADER_LOGGER_NAME)

     # Module-level rebinding, visible to every urllib user in the process.
     spider_opener = IMDBSpiderURLopener()
     urllib._urlopener = spider_opener
Пример #20
0
from downloader import Downloader
import downloader_config as DownloaderConfig

import urllib
from urllib import FancyURLopener

import re

import random

import os

import time

# Logger for this module
logger = initLogger.getLogger(DownloaderConfig.IMDB_DOWNLOADER_LOGGER_NAME)

####################################################################

# Pages local PATHs

def personPath(imdb_id):
    """Return ``DownloaderConfig.IMDB_PERSON_ROOT`` + ``imdb_id`` + ``".html"``."""
    return "{0}{1}.html".format(DownloaderConfig.IMDB_PERSON_ROOT, imdb_id)

####################################################################

# Page URLs

def personURL(imdb_id):
    url = "http://www.imdb.com/name/{0}/".format(imdb_id)
Пример #21
0
########################

#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
from Extractor.superExtractor import SuperExtractor #Charge la super classe

import Logger.logger_config as loggerConfig
import Extractor.extractor_config as extractorConfig

# Import modules external to the application
from lxml import etree
from lxml.html.clean import Cleaner
import StringIO

logger = initLogger.getLogger(extractorConfig.EXTRACTOR_HTML_LOGGER_NAME)
#########################




""" La classe ExtractorHTML herite de la classe SuperExtractor. Elle permet de définir les fonctions nécessaire à l'extraction de données dans un document HTML, dont la chaine de caractère est donnée en paramètre de l'objet"""

class ExtractorHTML(SuperExtractor):
   def __init__(self,htmlString,cleaner):
      SuperExtractor.__init__(self,htmlString)
      self.htmlString = self.string.replace("\n","").replace("\r","")

      self.parser = etree.HTMLParser()
      self.cleaner = cleaner
      self.cleanString = self.cleaner.clean_html(self.htmlString)
Пример #22
0
#importe les modules internes
import Logger.init_logger as initLogger #Initialise le logger
import Logger.logger_config as loggerConfig

import IMDBExtractor_config as IMDBExtractorConfig

import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor

from status.models import *
from cinema.models import *

import threading
import time
import random
logger = initLogger.getLogger(IMDBExtractorConfig.EXTRACTOR_PERSON_PIC_LOGGER_NAME)

###################################################################

# Arguments passed to getExtractedFiltered() below.
# NOTE(review): presumably a release-year range and a priority ceiling - confirm
# against IMDBFilmStatusConnector.
year_min=1980
year_max=1989
priority_max=1000


# Ask the film status connector for the ids matching the filter above.
film_conn = Connector.IMDBStatusConnector.IMDBFilmStatusConnector()
film_id_tab = film_conn.getExtractedFiltered(year_min,year_max,priority_max)
for film_id in film_id_tab:
   logger.debug('Film en cours d extraction : {}'.format(film_id))
   # Status is set to "0" before the actors/directors extraction and to "1"
   # after it returns. If the extraction raises, the film keeps status "0".
   # A new connector is built for each status write; film_conn is not reused.
   Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(film_id, "0")
   FilmExtractor.IMDB_Extractor.IMDB_actorsDirectorsExtract(film_id)
   Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(film_id, "1")
Пример #23
0
# Command-line switches. Each one is a boolean flag (store_true) that turns on
# one stage of the scraper, as described by its help text.
parser.add_argument('-imdb_ex', '--imdb-extractor', dest = 'imdb_extractor', help = "Si présent, lance l'extraction des fichiers HTML en provenace de IMDB", action='store_true')
parser.add_argument('-imdb_fsp', '--imdb-spider', dest = 'imdb_spider', help = "Si présent, lance le spider pour IMDB", action='store_true')
parser.add_argument('-imdb_fpsp', '--imdb-priority-spider', dest = 'imdb_priority_spider', help = "Si présent, lance le spider pour les priorités de film IMDB", action='store_true')
parser.add_argument('-imdb_psp', '--imdb-person-spider', dest = 'imdb_person_spider', help = "Si présent, lance le spider pour les personnes IMDB", action='store_true')
parser.add_argument('-imdb_fdw', '--imdb-film-downloader', dest = 'imdb_film_downloader', help = "Si présent, lance le downloader des films IMDB", action='store_true')
parser.add_argument('-imdb_pdw', '--imdb-person-downloader', dest = 'imdb_person_downloader', help = "Si présent, lance le downloader des personnes IMDB", action='store_true')
parser.add_argument('-imdb_cdw', '--imdb-company-downloader', dest = 'imdb_company_downloader', help = "Si présent, lance le downloader des entreprises IMDB", action='store_true')
parser.add_argument('-pic', '--imdb-person-picture', dest = 'imdb_person_picture', help = "Si présent, lance l'extraction d'image des personnes", action='store_true')
parser.add_argument('-aw', '--imdb-awards', dest = 'imdb_awards', help = "Si présent, lance l'extraction d'awards des films", action='store_true')
parser.add_argument('-actor', '--imdb-actors', dest = 'imdb_actors', help = "Si présent, lance l'extraction des acteurs des films", action='store_true')
# Store the parsed command-line arguments on initConfig so they are
# reachable globally.
initConfig.args = parser.parse_args()

##########
# Create the logger for this script
logger = initLogger.getLogger (initConfig.SCRAPER_INIT_LOGGER_NAME)
logger.debug('Logger {} créé'.format(initConfig.SCRAPER_INIT_LOGGER_NAME))

#################
# Empty the log file when requested
debug_file = initConfig.RUN_TIME_FOLDER + loggerConfig.LOG_FILE
if initConfig.args.fresh_debug:
    logger.info ('Vide le fichier {}...'.format(debug_file))
    # Opening in 'w' mode truncates the file; it is closed right away.
    open(debug_file, 'w').close()

if initConfig.args.imdb_spider:
    logger.info ('Lancement du Spider')
    # Imported only when the flag is set.
    # NOTE(review): presumably the spider runs as an import side effect - confirm.
    import Spider.IMDBSpider

if initConfig.args.imdb_priority_spider:
    logger.info ('Lancement du Spider de Priorités')
Пример #24
0
#! /usr/bin/env python
# -*- coding: latin-1 -*-

# Check that every required module can be imported before starting the application
import Logger.init_logger as initLogger #initialise le logger
import init_config as initConfig

logger = initLogger.getLogger(initConfig.IMPORTS_LOGGER_NAME)


import MySQLdb

# Fail fast with a clear log message when the required lxml package is missing.
try:
   import lxml
except ImportError:
   # Catch only a failed import. A bare ``except`` would also swallow
   # KeyboardInterrupt/SystemExit and hide unrelated errors raised while
   # lxml is loading.
   logger.critical("Le module lxml est nécessaire pour l'application, mais n'a pas été trouvé. Installation requise")
   # Exit status 1, as before. ``exit()`` is a helper injected by the ``site``
   # module (absent under ``python -S``), so raise SystemExit directly.
   raise SystemExit(1)
#importe les modules internes
import Logger.init_logger as initLogger  #Initialise le logger
import Logger.logger_config as loggerConfig

import FilmExtractor_config as FilmExtractorConfig

import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor

from status.models import *
from cinema.models import *

import threading
import time
logger = initLogger.getLogger(
    FilmExtractorConfig.EXTRACTOR_IMDB_INIT_LOGGER_NAME)

###################################################################

# Module-level selection bounds.
# NOTE(review): not referenced in the visible part of this script; presumably a
# year range and a priority ceiling used further down - confirm.
year_min = 2000
year_max = 2012
priority_max = 1000


def extractOneMovie(imdb_id):
    """Call ``IMDB_SuperExtractor`` for ``imdb_id``; its result is discarded."""
    run_extraction = FilmExtractor.IMDB_Extractor.IMDB_SuperExtractor
    run_extraction(imdb_id)


def extractOneMovieAwards(imdb_id):
    """Call ``IMDB_awardsExtract`` for ``imdb_id``; its result is discarded."""
    run_awards_extraction = FilmExtractor.IMDB_Extractor.IMDB_awardsExtract
    run_awards_extraction(imdb_id)