def __init__(self, dbpool):
    """Store the DB connection pool and route Scrapy log events to a local file.

    :param dbpool: database connection pool (presumably a twisted adbapi
        pool — TODO confirm); only stored here for later use.
    """
    self.dbpool = dbpool
    # NOTE(review): reload(sys) + setdefaultencoding is the well-known Python 2
    # hack to force utf-8 as the implicit str<->unicode codec. It can mask real
    # encoding bugs and does not exist in Python 3 — confirm it is still needed.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    # The file handle is deliberately left open: the observer writes to it for
    # the lifetime of the process.
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
def __init__(self, key=None, **kwargs):
    """Sitemap spider initializer.

    Requires a spider type `key`; opens one INFO and one WARNING log file
    under /var/log/scrapyd/logs/, records the redis-style `self.key` and a
    last-modified timestamp, then defers to the base spider.
    """
    if key is None:
        raise Exception("Must specify a spider type!")
    self._type = key
    print(key)

    stamp = datetime.utcnow()
    log_path = '/var/log/scrapyd/logs/'
    # NOTE(review): the log directory is assumed to already exist;
    # open() below will fail if it does not.
    info_path = os.path.join(log_path, "scrapy_%s_%s_%s.log" % (self.name, self._type, stamp))
    print(info_path)
    info_observer = ScrapyFileLogObserver(open(info_path, 'w'), level=logging.INFO)
    info_observer.start()

    warn_path = os.path.join(log_path, "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, stamp))
    warn_observer = ScrapyFileLogObserver(open(warn_path, 'w'), level=logging.WARNING)
    warn_observer.start()

    self.key = "%s:%s" % (self.name, self._type)
    self.lastmodified = datetime.utcnow()
    # load urls, load last crawled time
    super(GeneralSitemapSpider, self).__init__(self.name, **kwargs)
def __init__(self, key=None, **kwargs):
    """General crawl spider initializer.

    Resolves start URLs and crawl rules for `key` from URL_MAP / RULE_MAP,
    then opens per-run INFO and WARNING log files before delegating to the
    base spider.
    """
    # fetch the general to-crawl list here from file or DB
    if key is None:
        raise Exception("No start urls selected!")
    print(key)
    self._type = key
    self.start_urls = URL_MAP.get(key)
    print(self.start_urls)
    self.rules = RULE_MAP.get(key)
    print(self.rules)

    stamp = datetime.datetime.utcnow()
    log_path = '/var/log/scrapyd/logs/'
    info_path = os.path.join(log_path, "scrapy_%s_%s_%s.log" % (self.name, self._type, stamp))
    print(info_path)
    info_observer = ScrapyFileLogObserver(open(info_path, 'w'), level=logging.INFO)
    info_observer.start()

    warn_path = os.path.join(log_path, "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, stamp))
    warn_observer = ScrapyFileLogObserver(open(warn_path, 'w'), level=logging.WARNING)
    warn_observer.start()

    self.first_not_filter = []
    # start_urls and rules were selected above based on the `key` parameter
    super(GeneralSpider, self).__init__(self.name, **kwargs)
def runspider():
    """Flask endpoint: crawl the requested route/domain with MySpider.

    Blocks until the twisted reactor stops (spider closed), then redirects
    to the graph view for this scrape's domain and timestamp.
    """
    now = datetime.datetime.utcnow()
    stamp = calendar.timegm(now.utctimetuple())
    route = request.args.get('route')
    domain = request.args.get('domain')

    # NOTE(review): Windows-style backslash path; breaks on POSIX — confirm target OS.
    directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, stamp)
    if not os.path.exists(directory):
        os.makedirs(directory)

    observer = ScrapyFileLogObserver(open('testlog.log', 'w'), level=logging.DEBUG)
    observer.start()
    log.start(loglevel=logging.DEBUG)

    # stop the reactor once the spider finishes
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = MySpider(route, stamp)
    settings = CrawlerSettings(importlib.import_module('SiteCrawler.settings'))
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    log.msg('Running reactor...')
    reactor.run()  # blocks here until the spider is closed
    log.msg('Reactor stopped.')
    return redirect(url_for('choose_graph', domain=domain, date=stamp))
def __init__(self, date=None, coursecode=None):
    """Race-card spider: historical mode only when both `date` and
    `coursecode` are supplied; otherwise it runs in live mode.
    """
    self.historical = date is not None and coursecode is not None
    if self.historical:
        self.racedate = date
        self.racecode = coursecode
    # send all Scrapy log events for this run to a local debug file
    handle = open('testlog.log', 'w')
    observer = ScrapyFileLogObserver(handle, level=logging.DEBUG)
    observer.start()
def __init__(self):
    """Open a debug log file, then launch a Chrome webdriver session
    with stored (redacted) credentials.
    """
    # route Scrapy log output to a local debug file
    handle = open('log/testlog.log', 'w')
    observer = ScrapyFileLogObserver(handle, level=logging.DEBUG)
    observer.start()
    log.msg("initiating crawler...", level=log.INFO)

    # Selenium needs the chromedriver binary path both in the env and as the arg
    chromedriver = "/Users/starsdeep/tools/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    self.driver = webdriver.Chrome(chromedriver)

    # credentials are redacted placeholders
    self.username = '******'
    self.password = '******'
from scrapy.log import ScrapyFileLogObserver #date parsing module, not used by default, more info http://code.google.com/p/parsedatetime/ #import parsedatetime.parsedatetime as pdt """Creates the folder if it doesn't exist already'""" FOLDER = './data' try: os.mkdir(FOLDER) except OSError, e: if e.errno != errno.EEXIST: raise Exception("Can't create directory'") """Enables loging into file and to standard output""" logfile = open('%s/google.log' %FOLDER, 'a+b') log_observer = ScrapyFileLogObserver(logfile, level=logging.INFO) log_observer.start() """Google custom search API query parameters Required prameters are: cx - custom search engine unique ID key - unique API key, provides API access q - search query other parameters are optional: filter - 0 disables duplicate content filter (default is 1) sort - date:a - ascending sort by date dateRestrict - w[number] - restrict results to number of weeks more info: https://developers.google.com/custom-search/v1/using_rest#query-params """ PARAMS = { 'dateRestrict' : 'w0',
import sys, os, datetime, errno
import logging
from scrapy.log import ScrapyFileLogObserver

# Route Scrapy's log output to a dated file under $ACCUM/log/escort/....
# Failures are non-fatal: the spider still runs, reporting to stderr only.
logdir = None
# fix: `logfile` was undefined whenever the try-block failed early, so the
# final status line raised NameError instead of printing the failure notice.
logfile = None
try:
    today = datetime.datetime.utcnow().strftime("%Y%m%d")
    # ACCUM is the root directory; fix: bare `except:` replaced by an
    # explicit default lookup (no more swallowing unrelated errors)
    accum = os.environ.get("ACCUM", "/lfs1/users/wat")
    logdir = os.path.join(accum, "log/escort/%s/www.eros.com/" % today)
    # ensure the log directory exists (EEXIST is fine; anything else is real)
    try:
        os.makedirs(logdir)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    logfile = open(os.path.join(logdir, "scrapy.log"), 'a')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
except Exception as e:
    # sys.stderr.write keeps this line valid on both Python 2 and 3
    # (the original `print >> sys.stderr` is 2-only); output is identical.
    sys.stderr.write("Failed to create log dir %r [%r]\n" % (logdir, e))
sys.stderr.write("SETTINGS: log file %r\n" % logfile)