Exemplo n.º 1
0
 def run(self, args, opts):
     """Export the frame produced by ``__collectMissingUrl`` as a CSV file.

     Positional arguments (exactly four are required):
       1. JIRA id, used as a sub-directory name under both settings paths
       2. tracking file name, resolved under ``REQUEST_TRACKING_PATH``
       3. data file name, resolved under ``STORAGE_DIR``
       4. method name forwarded unchanged to ``__collectMissingUrl``

     Raises:
         UsageError: if the number of arguments is not four.

     When either input file is missing a message is printed and the
     command returns without doing anything else.
     """
     if len(args) != 4:
         raise UsageError()
     jira_id, tracking_filename, data_file, final_method_name = args

     tracking_path = os.path.join(
         self.settings.get('REQUEST_TRACKING_PATH'), jira_id,
         tracking_filename)
     if not FileUtils.isExist(tracking_path):
         print("tracking path is not exists:{0}".format(tracking_path))
         return

     data_path = os.path.join(
         self.settings.get('STORAGE_DIR'), jira_id, data_file)
     if not FileUtils.isExist(data_path):
         print("data_path path is not exists:{0}".format(data_path))
         return

     missing = self.__collectMissingUrl(data_path, tracking_path,
                                        final_method_name)
     if missing.empty:
         print("No Missing URL Captured")
         return

     # The result is written next to the tracking file, pipe-separated.
     out_path = os.path.join(
         self.settings.get('REQUEST_TRACKING_PATH'), jira_id,
         'missing_url.csv')
     missing.to_csv(out_path, sep='|', encoding='utf-8', index=False)
Exemplo n.º 2
0
 def spider_closed(self, spider):
     """Remove the spider's Selenium download folder, if it exists.

     The folder is ``<SELENIUM_DOWNLOAD_PATH>/<spider.name>``; the base
     path falls back to ``/tmp`` when the setting is absent.
     """
     logger.info("started to Clean the Download Folder in spider %s",
                 spider.name)
     base_dir = self.settings.get('SELENIUM_DOWNLOAD_PATH', '/tmp')
     download_path = os.path.join(base_dir, spider.name)
     if not FileUtils.isExist(download_path):
         return
     FileUtils.deletePath(download_path)
Exemplo n.º 3
0
 def run(self, args, opts):
     """Delete the pause/resume job directory that belongs to one spider.

     Expects a single positional argument: the spider name. The directory
     is ``<JOB_DIR_PAUSE_RESUME>/<spider name>``.

     Raises:
         UsageError: if the number of arguments is not one.
     """
     if len(args) != 1:
         raise UsageError()
     (spidername,) = args

     known_spiders = self.crawler_process.spider_loader.list()
     if spidername not in known_spiders:
         print("Spider not available: {}".format(spidername))
         return

     job_dir = os.path.join(
         self.settings.get('JOB_DIR_PAUSE_RESUME'), spidername)
     if not FileUtils.isExist(job_dir):
         return
     FileUtils.deletePath(job_dir)
     print("Job Directory is deleted- {}".format(job_dir))
Exemplo n.º 4
0
    def run(self, args, opts):
        """Count tracked requests per response status and write the summary.

        Positional arguments (exactly two are required):
          1. JIRA id, used as a sub-directory of ``REQUEST_TRACKING_PATH``
          2. tracking file name inside that sub-directory

        The summary is printed and written pipe-separated to
        ``<REQUEST_TRACKING_PATH>/<jira id>/analysis.csv``.

        Raises:
            UsageError: if the number of arguments is not two.
        """
        if len(args) != 2:
            raise UsageError()
        jira_id = args[0]
        tracking_filename = args[1]
        tracking_path = os.path.join(
            self.settings.get('REQUEST_TRACKING_PATH'), jira_id,
            tracking_filename)
        if not FileUtils.isExist(tracking_path):
            print("tracking path is not exists:{0}".format(tracking_path))
            return

        class AnalysisTracking:
            """Aggregate a pipe-separated tracking file by response status."""

            def __init__(self, tracking_path, out_path):
                self.tracking_path = tracking_path
                self.out_path = out_path

            def analysis(self):
                """Join request rows to response rows and count per status."""
                df = pd.read_csv(self.tracking_path, sep='|')
                # Assign the filled column back instead of calling
                # fillna(inplace=True) on the column selection: under pandas
                # Copy-on-Write the in-place form does not update ``df``.
                df["status"] = df["status"].fillna(0.0)
                request = df.loc[df['type'] == 'request'].reset_index(
                    drop=True)[['unique_id']]
                response = df.loc[df['type'] == 'response'].reset_index(
                    drop=True)[['unique_id', 'status']]
                df = pd.merge(request, response, how='left', on=['unique_id'])
                # Requests with no matching response get NaN from the left
                # join; map them to status 0.0 so groupby does not drop them.
                df["status"] = df["status"].fillna(0.0)
                res = df.groupby(['status']).size().reset_index(name='counts')
                res.to_csv(self.out_path, sep='|', encoding='utf-8',
                           index=False)
                print(res)

        out_path = os.path.join(
            self.settings.get('REQUEST_TRACKING_PATH'), jira_id,
            'analysis.csv')
        class_analysis = AnalysisTracking(tracking_path, out_path)
        class_analysis.analysis()
Exemplo n.º 5
0
 def getSeleniumDriver(self,
                       drivertype=None,
                       executable_path=None,
                       run_headless=False,
                       load_images=True,
                       proxy_string=None,
                       **kwargs):
     """Create a ``Driver`` for *drivertype* and return its driver object.

     The directory ``<SELENIUM_DOWNLOAD_PATH>/<self.name>`` (base defaults
     to ``/tmp``) is created when missing and forwarded to the driver
     factory as the ``download_path`` keyword argument. The ``Driver``
     wrapper is kept on ``self.driverObj``.

     Raises:
         SeleniumExtensionsException: if ``getDriver`` raises anything.
     """
     base_dir = self.settings.get('SELENIUM_DOWNLOAD_PATH', '/tmp')
     download_path = os.path.join(base_dir, self.name)
     if not FileUtils.isExist(download_path):
         FileUtils.createDir(download_path)
     kwargs['download_path'] = download_path

     self.driverObj = Driver(drivertype)
     try:
         return self.driverObj.getDriver(executable_path, run_headless,
                                         load_images, proxy_string,
                                         **kwargs)
     except Exception as e:
         # NOTE(review): the "{}" placeholder is passed through unformatted
         # together with the exception as a second argument - confirm
         # whether SeleniumExtensionsException formats it itself.
         raise SeleniumExtensionsException(
             "problem to get selenium driver-{}", e)
Exemplo n.º 6
0
 def spider_opened(self, spider):
     """Ensure ``self.path`` points at an existing folder.

     When the path is unset or no longer exists, a new temp folder is
     created and published through the ``TEMP_FILE_PATH`` settings
     override.
     """
     if self.path and FileUtils.isExist(self.path):
         return
     self.path = FileUtils.createTempFolder()
     self._settings.overrides['TEMP_FILE_PATH'] = self.path
Exemplo n.º 7
0
 def spider_closed(self, spider):
     """Delete the folder at ``self.path`` when it is set and exists."""
     if not self.path or not FileUtils.isExist(self.path):
         return
     FileUtils.deletePath(self.path)
Exemplo n.º 8
0
 def __checkAppendMode(self, name):
     """Switch on ``self.appendMode`` when a non-trivial seen-file exists.

     Looks at ``<job_dir>/<name>/requests.seen``; append mode is enabled
     only if that file exists and is larger than 10 bytes. Nothing happens
     when ``self.job_dir`` is not set.
     """
     if not self.job_dir:
         return
     seen_file = os.path.join(self.job_dir, name, 'requests.seen')
     if not FileUtils.isExist(seen_file):
         return
     if os.stat(seen_file).st_size > 10:
         self.appendMode = True
Exemplo n.º 9
0
 def _genspider(self, jiraid, module, name, domain, requirement_path, url,
                 template_name, template_file, opts):
     """Generate a spider (and optional support files) from a template.

     The spider is written to ``<spiders_dir>/<name>.py`` where
     ``spiders_dir`` is ``<NEWSPIDER_MODULE dir>/<jiraid>`` when
     ``NEWSPIDER_MODULE`` is configured, otherwise the current directory.
     When ``CUSTOM_TEMPLATES_DIR`` is set, ``items.py`` is rendered from
     ``items.py.tmpl``, an ``__init__.py`` is created and the requirement
     document is copied alongside the spider.

     Nothing is generated (a message is printed) when the ``jiraid``
     directory already exists. Any exception raised during generation is
     printed, and a directory created by this call is removed again.
     """
     headers = self.__headers(requirement_path)
     val_header = headers.get('top_header')
     # NOTE: this mutates the dict held by ``headers`` in place, so the
     # 'top_header' rendered below no longer contains these keys either.
     for k in ['sourceName', 'url', 'ingestion_timestamp']:
         val_header.pop(k)
     tvars = {
         'project_name': self.settings.get('BOT_NAME'),
         'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
         'module': module,
         'jiraid': jiraid,
         'name': name,
         'start_url': url,
         'username': Utils.getUsername(),
         'datetime': Utils.getCurrentDateTimeStr(),
         'domain': domain,
         'val_header': val_header,
         'ingestion_timestamp': 'Utils.getingestion_timestamp()',
         'default_val': {'sourceName': name, 'url': url},
         'null_header': None,
         'feed_expo': None,
         'top_header': None,
         'classname': '%sSpider' % ''.join(
             s.capitalize() for s in name.split('_')),
     }
     # Only a directory created by this call may be removed on failure.
     # Cleaning up ``spiders_dir`` unconditionally would wipe the current
     # working directory when NEWSPIDER_MODULE is unset, and would raise
     # UnboundLocalError if the failure happened before it was assigned.
     created_dir = None
     try:
         if self.settings.get('NEWSPIDER_MODULE'):
             spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
             spiders_dir = os.path.join(
                 abspath(dirname(spiders_module.__file__)), jiraid)
             if os.path.exists(spiders_dir):
                 print("Spider %r jiraID already exists in module:" % jiraid)
                 return
             os.mkdir(spiders_dir)
             created_dir = spiders_dir
         else:
             spiders_module = None
             spiders_dir = "."

         if opts.custom:
             import pprint
             pp = pprint.PrettyPrinter(indent=25, width=250)
             tvars['null_header'] = headers.get('null_header')
             tvars['feed_expo'] = pp.pformat(headers.get('feed_expo'))
             tvars['top_header'] = pp.pformat(headers.get('top_header'))
         spider_file = "%s.py" % join(spiders_dir, name)
         shutil.copyfile(template_file, spider_file)
         render_templatefile(spider_file, **tvars)
         if self.settings['CUSTOM_TEMPLATES_DIR']:
             _template_file = join(self.settings['CUSTOM_TEMPLATES_DIR'],
                                   'items.py.tmpl')
             item_file = "%s.py" % join(spiders_dir, 'items')
             shutil.copyfile(_template_file, item_file)
             render_templatefile(item_file, **tvars)
             # Make sure the package marker exists without truncating it.
             init_file = "%s.py" % join(spiders_dir, '__init__')
             open(init_file, 'a').close()
             # copy the requirement document in spider folder
             shutil.copyfile(
                 requirement_path,
                 join(spiders_dir, os.path.basename(requirement_path)))

         print("Created spider %r using template %r " % (name, template_name),
               end=('' if spiders_module else '\n'))
         if spiders_module:
             print("in module:\n  %s.%s" % (spiders_module.__name__, module))
     except Exception as e:
         # Roll back only what this call created.
         if created_dir:
             FileUtils.deletePath(created_dir)
         print(e)