def crawl(listen, verbose, outputfile, outputtype):
    """Construct a Crawler over config.URLS, apply the CLI options, and start it.

    Args:
        listen: when truthy, switch the crawler into listener mode.
        verbose: when truthy, enable verbose output on the crawler.
        outputfile: path the crawler writes its results to.
        outputtype: format of the output file (passed through unchanged).
    """
    crawler = Crawler.Crawler(urls=config.URLS)
    # Flags only ever enable a mode; otherwise the Crawler's own defaults stand.
    if listen:
        crawler.listener = True
    if verbose:
        crawler.verbose = True
    crawler.output_file = outputfile
    crawler.output_type = outputtype
    crawler.run()
import subprocess
import textract

if __name__ == '__main__':
    # Load configuration and make sure the download target exists and is a directory.
    config = configparser.ConfigParser()
    config.read('conf.ini')
    downloads_path = config.get('general', 'downloads_path', fallback='/tmp/downloads/')
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
    elif not os.path.isdir(downloads_path):
        print('ERROR: downloads_path parameter points to file!')
        sys.exit(1)

    # Scrape the captions index page and download every linked PDF.
    crawler = Crawler(config, 'captions')
    crawler.get(config.get('captions', 'url'))
    captions_list = []  # NOTE(review): unused in this chunk; kept in case later code appends to it
    for row in crawler.get_elements('tbody tr'):
        items = crawler.get_elements('td', root=row)
        # First cell holds the document name; its anchor holds the download URL.
        filename = items[0].text
        print("Current Filename is {}".format(filename))
        url = crawler.get_attr('a', 'href', root=items[0])
        crawler.download(url, filename)
        # Convert the downloaded PDF to plain text alongside it.
        convert_filename = filename.replace('.pdf', '.txt')
        # SECURITY FIX: `filename` is scraped from a remote page, so it is
        # untrusted input. The previous os.system("pdftotext '%s' '%s'" % ...)
        # interpolated it into a shell string, allowing command injection via a
        # crafted filename. subprocess.run with an argument list and no shell
        # passes the paths verbatim to pdftotext.
        subprocess.run(
            ['pdftotext',
             os.path.join(downloads_path, 'Captions', filename),
             os.path.join(downloads_path, 'Captions', convert_filename)],
            check=False,  # match os.system: ignore pdftotext's exit status
        )
import textract if __name__ == '__main__': config = configparser.ConfigParser() config.read('conf.ini') downloads_path = config.get('general', 'downloads_path', fallback='/tmp/downloads/') if not os.path.exists(downloads_path): os.makedirs(downloads_path) elif not os.path.isdir(downloads_path): print('ERROR: downloads_path parameter points to file!') sys.exit(1) crawler = Crawler(config, 'revenues') crawler.get(config.get('revenues', 'url')) revenue_total_list = [] expenditure_total_list = [] for row in crawler.get_elements('tbody tr'): items = crawler.get_elements('td', root=row) filename = items[0].text print("Current Filename is {}".format(filename)) url = crawler.get_attr('a', 'href', root=items[0]) crawler.download(url, filename) convert_filename = filename.replace('.pdf', '.txt') os.system("pdftotext '%s' '%s'" % (downloads_path + 'Revenues/' + filename,
import sys if __name__ == '__main__': config = configparser.ConfigParser() config.read('conf.ini') downloads_path = config.get('general', 'downloads_path', fallback='/tmp/downloads/') if not os.path.exists(downloads_path): os.makedirs(downloads_path) elif not os.path.isdir(downloads_path): print('ERROR: downloads_path parameter points to file!') sys.exit(1) crawler = Crawler(config, 'assets') crawler.get(config.get('assets', 'url')) asset_total_list = [] liability_total_list = [] for row in crawler.get_elements('tbody tr'): items = crawler.get_elements('td', root=row) filename = items[0].text print("Current Filename is {}".format(filename)) url = crawler.get_attr('a', 'href', root=items[0]) crawler.download(url, filename) convert_filename = filename.replace('.pdf', '.txt') os.system("pdftotext '%s' '%s'" % (downloads_path + 'Assets/' + filename,