示例#1
0
 def run(self):
     """Execute the scraping job described by ``self.config``.

     Loads the start page, extracts every configured attribute, follows
     any ``next`` page definitions via ``traverse_next``, and finally
     writes the collected records to ``<output_filename>.json`` or
     ``.csv`` depending on the ``--output_type`` argument.
     """
     # SECURITY: eval() executes config-supplied text. The config is
     # assumed trusted here; an explicit {selector_type: class} mapping
     # would be safer than eval().
     selectorClass = getattr(
         eval(self.config['selector_type']),
         self.config['selector_type'].title() + 'Selector')
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     try:
         result = dict()
         print()
         print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url'] \
             + Back.RESET + Fore.RESET)
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             if attribute['field'] != "":
                 print("\nExtracting",
                       attribute['field'],
                       "attribute",
                       sep=' ')
                 result[attribute['field']] = selector.extract_content(
                     attribute['selector'], attribute['attr'],
                     attribute['default'])
         if not self.config['scraping'].get('next'):
             results['data'].append(result)
         else:
             # 'next_page' rather than 'next' -- avoid shadowing the builtin.
             for next_page in self.config['scraping']['next']:
                 for r in traverse_next(selector, next_page, result):
                     results['data'].append(r)
     except KeyboardInterrupt:
         # Interrupted runs still write whatever was collected (finally).
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
                 json.dump(results, f)
         elif self.args['--output_type'] == 'csv':
             import csv
             # newline='' stops csv.writer emitting blank rows on Windows.
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
                 'w', newline='') as f:
                 fields = extract_fieldnames(self.config)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'], \
               ".", self.args['--output_type'], " has been created" \
               + Back.RESET + Fore.RESET, sep="")
示例#2
0
文件: run.py 项目: Deshklok/scrapple
 def run(self):
     """Run the configured scraping job and persist the results.

     Fetches the start URL with the configured selector class, extracts
     each non-empty ``field`` entry, walks any ``next`` page chains, and
     writes the accumulated data as JSON or CSV per ``--output_type``.
     """
     # SECURITY: eval() executes text taken from the config file. This
     # assumes the config is trusted; a dispatch dict keyed on
     # selector_type would avoid eval() entirely.
     selectorClass = getattr(
             eval(self.config['selector_type']), 
             self.config['selector_type'].title() + 'Selector'
             )
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     try:
         result = dict()
         print()
         print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url'] \
             + Back.RESET + Fore.RESET)
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             if attribute['field'] != "":
                 print("\nExtracting", attribute['field'], "attribute", sep=' ')
                 result[attribute['field']] = selector.extract_content(attribute['selector'], attribute['attr'], attribute['default'])
         if not self.config['scraping'].get('next'):
             results['data'].append(result)
         else:
             # Renamed from 'next' so the builtin is not shadowed.
             for next_page in self.config['scraping']['next']:
                 for r in traverse_next(selector, next_page, result):
                     results['data'].append(r)
     except KeyboardInterrupt:
         # Partial results are still written out below.
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
                 json.dump(results, f)
         elif self.args['--output_type'] == 'csv':
             import csv
             # newline='' prevents the extra blank rows csv.writer
             # otherwise produces on Windows.
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
                 'w', newline='') as f:
                 fields = extract_fieldnames(self.config)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'], \
               ".", self.args['--output_type'], " has been created" \
               + Back.RESET + Fore.RESET, sep="")
示例#3
0
文件: run.py 项目: onstash/scrapple
 def run(self):
     """Execute the scraping job, including tabular extraction.

     Extracts plain attributes and any configured ``table`` sections,
     follows ``next`` page definitions, and writes the results as JSON
     (indent=3) or CSV. Console output is gated on ``--verbosity``.
     """
     # SECURITY: eval() runs config-supplied text; assumes a trusted
     # config. Prefer an explicit selector_type -> class mapping.
     selectorClass = getattr(
         eval(self.config['selector_type']),
         self.config['selector_type'].title() + 'Selector')
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     try:
         result = dict()
         tabular_data_headers = dict()
         if self.args['--verbosity'] > 0:
             print()
             print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url'] \
                 + Back.RESET + Fore.RESET, end='')
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             if attribute['field'] != "":
                 if self.args['--verbosity'] > 1:
                     print("\nExtracting",
                           attribute['field'],
                           "attribute",
                           sep=' ',
                           end='')
                 result[attribute['field']] = selector.extract_content(
                     **attribute)
         if not self.config['scraping'].get('table'):
             result_list = [result]
         else:
             # Initialise so extend() below cannot raise NameError when
             # every table entry has an empty selector.
             result_list = []
             tables = self.config['scraping'].get('table', [])
             for table in tables:
                 if table.get('selector', '').strip() != '':
                     table.update({
                         'result': result,
                         'verbosity': self.args['--verbosity']
                     })
                     table_headers, result_list = selector.extract_tabular(
                         **table)
                     # Record first-seen order of every tabular header.
                     for th in table_headers:
                         if th not in tabular_data_headers:
                             tabular_data_headers[th] = len(
                                 tabular_data_headers)
         if not self.config['scraping'].get('next'):
             results['data'].extend(result_list)
         else:
             for nextx in self.config['scraping']['next']:
                 for tdh, r in traverse_next(
                         selector,
                         nextx,
                         result,
                         verbosity=self.args['--verbosity']):
                     results['data'].append(r)
                     for th in tdh:
                         if th not in tabular_data_headers:
                             tabular_data_headers[th] = len(
                                 tabular_data_headers)
     except KeyboardInterrupt:
         # Partial results are still written by the finally block.
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
                 json.dump(results, f, indent=3)
         elif self.args['--output_type'] == 'csv':
             import csv
             # newline='' stops csv.writer emitting blank rows on Windows.
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
                 'w', newline='') as f:
                 fields = extract_fieldnames(self.config)
                 # Tabular headers appended in first-seen order.
                 data_headers = sorted(
                     tabular_data_headers,
                     key=lambda x: tabular_data_headers[x])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         if self.args['--verbosity'] > 0:
             print()
             print(Back.WHITE + Fore.RED + self.args['<output_filename>'], \
                   ".", self.args['--output_type'], " has been created" \
                   + Back.RESET + Fore.RESET, sep="")
示例#4
0
 def run(self):
     """Execute the scraping job, including tabular extraction.

     Resolves the selector class from an explicit mapping (no eval),
     extracts attributes and ``table`` sections, follows ``next`` pages,
     and writes JSON (indent=4) or CSV output per ``--output_type``.
     """
     selectorClassMapping = {
         'xpath': XpathSelector,
         'css': CssSelector
     }
     selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
     if selectorClass is None:
         # Fail fast with a clear message instead of a later TypeError
         # from calling None.
         raise ValueError(
             "Unknown selector_type: %r" % self.config['selector_type'])
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     try:
         result = dict()
         tabular_data_headers = dict()
         if self.args['--verbosity'] > 0:
             print()
             print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url'] \
                 + Back.RESET + Fore.RESET, end='')
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             if attribute['field'] != "":
                 if self.args['--verbosity'] > 1:
                     print("\nExtracting", attribute['field'], "attribute", sep=' ', end='')
                 result[attribute['field']] = selector.extract_content(**attribute)
         if not self.config['scraping'].get('table'):
             result_list = [result]
         else:
             # Initialise so extend() below cannot raise NameError when
             # every table entry has an empty selector.
             result_list = []
             tables = self.config['scraping'].get('table', [])
             for table in tables:
                 if table.get('selector', '').strip() != '':
                     table.update({
                         'result': result,
                         'verbosity': self.args['--verbosity']
                     })
                     table_headers, result_list = selector.extract_tabular(**table)
                     # Record first-seen order of every tabular header.
                     for th in table_headers:
                         if th not in tabular_data_headers:
                             tabular_data_headers[th] = len(tabular_data_headers)
         if not self.config['scraping'].get('next'):
             results['data'].extend(result_list)
         else:
             for nextx in self.config['scraping']['next']:
                 for tdh, r in traverse_next(selector, nextx, result, verbosity=self.args['--verbosity']):
                     results['data'].append(r)
                     for th in tdh:
                         if th not in tabular_data_headers:
                             tabular_data_headers[th] = len(tabular_data_headers)
     except KeyboardInterrupt:
         # Partial results are still written by the finally block.
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
                 json.dump(results, f, indent=4)
         elif self.args['--output_type'] == 'csv':
             import csv
             # newline='' stops csv.writer emitting blank rows on Windows.
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
                 'w', newline='') as f:
                 fields = extract_fieldnames(self.config)
                 # Tabular headers appended in first-seen order.
                 data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         if self.args['--verbosity'] > 0:
             print()
             print(Back.WHITE + Fore.RED + self.args['<output_filename>'], \
                   ".", self.args['--output_type'], " has been created" \
                   + Back.RESET + Fore.RESET, sep="")
示例#5
0
 def run(self):
     """Execute the scraping job, including tabular extraction.

     Extracts plain attributes, extracts each configured ``table``
     section with explicit per-key defaults, follows ``next`` pages,
     and writes the results as JSON or CSV per ``--output_type``.
     """
     # SECURITY: eval() runs config-supplied text; assumes a trusted
     # config. Prefer an explicit selector_type -> class mapping.
     selectorClass = getattr(
             eval(self.config['selector_type']), 
             self.config['selector_type'].title() + 'Selector'
             )
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     try:
         result = dict()
         tabular_data_headers = dict()
         print()
         print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url'] \
             + Back.RESET + Fore.RESET)
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             if attribute['field'] != "":
                 print("\nExtracting", attribute['field'], "attribute", sep=' ')
                 result[attribute['field']] = selector.extract_content(attribute['selector'], attribute['attr'], attribute['default'])
         if not self.config['scraping'].get('table'):
             result_list = [result]
         else:
             tables = self.config['scraping'].get('table')
             for table in tables:
                 table_headers, result_list = selector.extract_tabular(
                     result=result,
                     table_type=table.get('table_type', 'rows'),
                     header=table.get('header', []),
                     prefix=table.get('prefix', ''),
                     suffix=table.get('suffix', ''),
                     selector=table.get('selector', ''),
                     attr=table.get('attr', 'text'),
                     default=table.get('default', '')
                     )
                 # Record first-seen order of every tabular header.
                 for th in table_headers:
                     if th not in tabular_data_headers:
                         tabular_data_headers[th] = len(tabular_data_headers)
         if not self.config['scraping'].get('next'):
             results['data'].extend(result_list)
         else:
             # Renamed from 'next' so the builtin is not shadowed.
             for next_page in self.config['scraping']['next']:
                 for tdh, r in traverse_next(selector, next_page, result):
                     results['data'].append(r)
                     for th in tdh:
                         if th not in tabular_data_headers:
                             tabular_data_headers[th] = len(tabular_data_headers)
     except KeyboardInterrupt:
         # Partial results are still written by the finally block.
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
                 json.dump(results, f)
         elif self.args['--output_type'] == 'csv':
             import csv
             # newline='' stops csv.writer emitting blank rows on Windows.
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
                 'w', newline='') as f:
                 fields = extract_fieldnames(self.config)
                 # Tabular headers appended in first-seen order.
                 data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'], \
               ".", self.args['--output_type'], " has been created" \
               + Back.RESET + Fore.RESET, sep="")