def test_simple_download(self):
    url = 'http://olps.cgtransport.org/Esewa/img/ESEWA2.jpg'
    resource_size = 24963
    with tempfile.TemporaryDirectory() as temp_dir:
        logger = self.create_logger('test_simple_download')
        download_file = os.path.join(temp_dir, 'download_file')
        downloader = FileDownloader(url, download_file, logger)
        downloader.download()
        self.assertEqual(os.stat(download_file).st_size, resource_size)
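# For context: a minimal sketch of the FileDownloader interface the test above
# exercises. This is an assumption, not the real class: it guesses a plain
# urllib implementation behind the (url, path, logger) constructor and the
# no-argument download() used in test_simple_download.
import urllib.request


class FileDownloaderSketch:
    def __init__(self, url, path, logger):
        self.url = url
        self.path = path
        self.logger = logger

    def download(self):
        # Stream the body to disk in chunks so large files never sit in memory
        self.logger.debug('downloading %s -> %s', self.url, self.path)
        with urllib.request.urlopen(self.url) as response:
            with open(self.path, 'wb') as out_file:
                for chunk in iter(lambda: response.read(64 * 1024), b''):
                    out_file.write(chunk)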
def downloadCaptions(self):
    # Disable buttons while the download runs
    self.disableIOButtons()
    self.disableDownloadButtons(False)
    root = self.getOutputFolder()
    if self.file_downloader is None:
        self.file_downloader = FileDownloader(root)
        self.file_downloader.attachGUI(self)
    self.file_downloader.loadQueueFromList(self.download_queue_captions)
    self.file_downloader.startDownloadGui()
def download_from_url(url, output_directory, filename=None, use_cache=True):
    """Download file from a url and put it under output_directory.

    :param url: URL that gives the response.
    :type url: str

    :param output_directory: Directory to put the diagram.
    :type output_directory: str

    :param filename: Optional filename for downloaded file.
    :type filename: str

    :param use_cache: If there is a cached copy of the file already in the
        output directory, do not refetch it (True) or force refetch it
        (False).
    :type use_cache: bool

    :returns: File path if the download succeeded, else None.
    :rtype: str
    """
    if filename is None:
        filename = get_filename(url)
    LOGGER.info('Download file %s from %s' % (filename, url))
    file_path = os.path.join(output_directory, filename)
    if os.path.exists(file_path) and use_cache:
        LOGGER.info('File %s exists, not downloading' % file_path)
        return file_path

    # Set proxy on the network manager if one is configured
    proxy = get_proxy()
    network_manager = QNetworkAccessManager()
    if proxy is not None:
        network_manager.setProxy(proxy)

    # Download process
    # noinspection PyTypeChecker
    downloader = FileDownloader(network_manager, url, file_path)
    try:
        result = downloader.download()
    except IOError as ex:
        raise DownloadException(ex)

    if result[0] is not True:
        _, error_message = result
        raise DownloadException(error_message)

    if os.path.exists(file_path):
        return file_path
    else:
        return None
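# Hedged usage sketch for download_from_url: the URL and directory below are
# illustrative only, and get_proxy/QNetworkAccessManager/DownloadException are
# assumed to come from the surrounding module as used above.
#
# path = download_from_url('http://example.com/diagram.png', '/tmp/diagrams')
# cached = download_from_url('http://example.com/diagram.png', '/tmp/diagrams')
#     # second call returns the cached copy (use_cache defaults to True)
# fresh = download_from_url('http://example.com/diagram.png', '/tmp/diagrams',
#                           use_cache=False)  # forces a refetch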
def process_wget(input_data):
    data = validate_wget(input_data)
    if data:
        url = data[0]
        if data[1]:
            extension = data[1]
            # todo: where should the files be downloaded to?
            file_downloader = FileDownloader(url, extension)
            file_downloader.download_files()
        else:
            # todo: download with url only
            pass
    # print(data)
    # print('op : ' + str(data[0]))
    # print('url : ' + data[1])
    # print('extension : ' + str(data[2]))
def run(dst_path, datasetconfig, clean_dst=False, force_download=True,
        overwrite_current_date=None):
    try:
        print('\n> processing', datasetconfig["where_to_download"]["identifier"])
        dst_dataset_path = os.path.join(
            dst_path, datasetconfig["where_to_download"]["identifier"])
        if os.path.exists(dst_dataset_path) and clean_dst:
            shutil.rmtree(dst_dataset_path)

        # download
        print('File Downloads:')
        fs = FileDownloader(dst_dataset_path)
        fs.process(datasetconfig, force=force_download,
                   current_datetime=overwrite_current_date)
    except Exception as e:
        print('exception in run', e)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print(''.join(lines))
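# Illustrative only: run() itself reads just datasetconfig["where_to_download"]
# ["identifier"]; the rest of the schema is consumed by FileDownloader.process()
# and is not shown in this snippet. A hypothetical minimal call:
#
# example_config = {
#     "where_to_download": {
#         "identifier": "my-dataset",
#         # ...whatever source fields FileDownloader.process() expects
#     },
# }
# run('/data/raw', example_config, clean_dst=True, force_download=False)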
def start_a_job(self):
    current_job = None
    for job in self.job_list:
        if current_job is None and not job["isQueued"] and not job["isFinished"]:
            self.current_active_jobz += 1
            job["isQueued"] = True
            current_job = job
    print(f"Current Job is: {current_job}")
    if current_job is not None:
        message = current_job["message"]
        downloader = FileDownloader(message, self)
        # Start the image process by downloading the image
        downloader.get_file_path()
def read_excel_file(self):
    # Column mapping inferred from the _source lookups below:
    # row[0]=green_bond_issuer, row[1]=country, row[2]=market_information_template,
    # row[3]=external_review_form, row[4]=external_review_report, row[5]=external_link
    workbook = load_workbook(self.file_dir + 'GBP-SBP-resource-centre-250917.xlsx')
    first_sheet = workbook.get_sheet_names()[0]
    worksheet = workbook.get_sheet_by_name(first_sheet)
    for i, row in enumerate(worksheet.iter_rows()):
        if i == 0:
            continue
        if self.es.check_document_exists(index="green_bond", doc="report", id=i):
            print("Document " + str(i) + " exists!")
            doc = self.es.find_document(index="green_bond", doc="report", id=i)
            if (row[4].hyperlink is not None and
                    row[4].internal_value == doc["_source"]["external_review_report"]):
                try:
                    link = row[4].hyperlink.target
                    file_downloader = FileDownloader()
                    file_downloader.download_file(link)
                    text = self.read_pdf_file(self.file_dir + "temp_file.pdf")
                    gbp = GBP(row[0].internal_value, row[1].internal_value,
                              row[2].internal_value, row[3].internal_value,
                              text, row[5].internal_value)
                    self.es.update_document(index="green_bond", doc="report",
                                            id=i, data=json.dumps(gbp.__dict__))
                    print("Updated document " + str(i) + " successfully!")
                except:
                    print("Hyperlink error! Document " + str(i))
            if (row[3].hyperlink is not None and
                    row[3].internal_value == doc["_source"]["external_review_form"]):
                try:
                    link = row[3].hyperlink.target
                    file_downloader = FileDownloader()
                    file_downloader.download_file(link)
                    text = self.read_pdf_file(self.file_dir + "temp_file.pdf")
                    gbp = GBP(doc["_source"]["green_bond_issuer"],
                              doc["_source"]["country"],
                              doc["_source"]["market_information_template"],
                              text,
                              doc["_source"]["external_review_report"],
                              doc["_source"]["external_link"])
                    self.es.update_document(index="green_bond", doc="report",
                                            id=i, data=json.dumps(gbp.__dict__))
                    print("Updated document " + str(i) + " successfully!")
                except:
                    print("Hyperlink error! Document " + str(i))
            continue
        if row[4].hyperlink is None:
            gbp = GBP(row[0].internal_value, row[1].internal_value,
                      row[2].internal_value, row[3].internal_value,
                      row[4].internal_value, row[5].internal_value)
            self.es.add_document(i, json.dumps(gbp.__dict__))
            print("Added document " + str(i) + " successfully!")
        else:
            try:
                link = row[4].hyperlink.target
                file_downloader = FileDownloader()
                file_downloader.download_file(link)
                self.text = self.read_pdf_file(self.file_dir + "temp_file.pdf")
                gbp = GBP(row[0].internal_value, row[1].internal_value,
                          row[2].internal_value, row[3].internal_value,
                          self.text, row[5].internal_value)
                self.es.add_document(i, json.dumps(gbp.__dict__))
                print("Added document " + str(i) + " successfully!")
            except:
                print("Hyperlink error! Document " + str(i))
import sys
sys.path.append('..')
from file_downloader import FileDownloader

if __name__ == "__main__":
    FD = FileDownloader('../../conf/global_var.json')
    # Each batch entry is a 3-tuple: (document id, stored S3 image url,
    # raw Elasticsearch hit for that image)
    batch = [
        (u'704FCD3C4D673DE631BF985CFEABCB121A4EB7CF1FD135E1C98CFA02268FA035',
         u'https://s3.amazonaws.com/memex-images/full/bdb9cdde29d5a29411fe7a7dc1ca501a37303431.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131138,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'2E7904456709725B2B1F3108683D6895FCE0BDF8BFA29E34FD799FF8AAE4CFE5'],
              u'timestamp': [u'2016-01-26T12:31:11'],
              u'obj_original_url': [u'http://469-978-3183.escortphonelist.com/images/escort-phone-list.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/bdb9cdde29d5a29411fe7a7dc1ca501a37303431.jpg']},
          u'_id': u'704FCD3C4D673DE631BF985CFEABCB121A4EB7CF1FD135E1C98CFA02268FA035'}),
        (u'3F98B492B2BA98676202271F85D2E3EF0D1E02B9A1C706E76D53C8AAA374494B',
         u'https://s3.amazonaws.com/memex-images/full/7f667480e65f17e2bec4c3da58bd407d06c4276a.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131138,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'44D3364D05EF7544A4B8ED36738FAFC9F56651D29D76C930DB73588A5221434B'],
              u'timestamp': [u'2016-01-26T12:32:20'],
              u'obj_original_url': [u'http://images.eroticmugshots.com/cities/63/large/21490623-2.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/7f667480e65f17e2bec4c3da58bd407d06c4276a.jpg']},
          u'_id': u'3F98B492B2BA98676202271F85D2E3EF0D1E02B9A1C706E76D53C8AAA374494B'}),
        (u'701DA3E5D25EB3329D710C7181ED6DA9D60D081711822EDA056D29F808E39A95',
         u'https://s3.amazonaws.com/memex-images/full/219e1c7ba41154f5dada28d1a0bddac280900f0d.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131138,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'7FE5C4C07A16FFFC22E359BD067D15C816F04BD433950C95767CE81DDB441923'],
              u'timestamp': [u'2016-01-26T12:32:11'],
              u'obj_original_url': [u'http://images.eroticmugshots.com/cities/13/thumbnail/34348493-1.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/219e1c7ba41154f5dada28d1a0bddac280900f0d.jpg']},
          u'_id': u'701DA3E5D25EB3329D710C7181ED6DA9D60D081711822EDA056D29F808E39A95'}),
        (u'6D7387E477A206C965EF1535BB83E004147531CE99F9AA14D2B3FD6FC94B22F1',
         u'https://s3.amazonaws.com/memex-images/full/173446f78bf85d8baf94456c4918e3941e72ee80.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131138,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'1E61CBB6E97B170EF81926D38E0CFBCD4F31763C1461051447713CF4601E8E7D'],
              u'timestamp': [u'2016-01-26T12:32:16'],
              u'obj_original_url': [u'http://images.escortsincollege.com/cities/126/large/10593977-3.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/173446f78bf85d8baf94456c4918e3941e72ee80.jpg']},
          u'_id': u'6D7387E477A206C965EF1535BB83E004147531CE99F9AA14D2B3FD6FC94B22F1'}),
        (u'5B369C1A8E426BC5A5A44FDCAC9D9CE378094967AFEAC184A49A5BAF22E62EF5',
         u'https://s3.amazonaws.com/roxyimages/5c76b19d1548351973868f9ff97d9c0a52f918cb.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131889,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'51FEA8D56B4AECCDEFB4469A08DDECD29413334D6575F43B6268CA026E85E586'],
              u'timestamp': [u'2015-12-17T23:06:11'],
              u'obj_original_url': [u'http://images4.backpage.com/imager/u/large/125355586/3ef7f93df039c1fa1b6dadfa6c482745.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/roxyimages/5c76b19d1548351973868f9ff97d9c0a52f918cb.jpg']},
          u'_id': u'5B369C1A8E426BC5A5A44FDCAC9D9CE378094967AFEAC184A49A5BAF22E62EF5'}),
        (u'0A775A8BDE19A6EA640636F02FD451056D5FDA2300844DCC0AFBC652C02BA37B',
         u'https://s3.amazonaws.com/roxyimages/3f844e64033e87ca67cfde1154bec0578ee6e2cb.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131889,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'BD7FB7967E393CEFA88185132FC3106C5DF162F6F63A21AAD6632170F9EDC858'],
              u'timestamp': [u'2015-12-30T05:58:09'],
              u'obj_original_url': [u'http://images2.backpage.com/imager/u/large/230328867/01bf2acb8bdf85b6df93aeeb2394c11b.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/roxyimages/3f844e64033e87ca67cfde1154bec0578ee6e2cb.jpg']},
          u'_id': u'0A775A8BDE19A6EA640636F02FD451056D5FDA2300844DCC0AFBC652C02BA37B'}),
        (u'CAFB17AB66382E6834B1AB6ED854750054188789532F80DE99B7253B811BDB1F',
         u'https://s3.amazonaws.com/roxyimages/54f226755c7371c1651c5b0920ca563b89e03860.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131889,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'1383B4D668C03367999D645EC4418A450AC5FF5D19EE0681E39CCB4E7FA00642'],
              u'timestamp': [u'2015-12-16T23:58:17'],
              u'obj_original_url': [u'http://images4.backpage.com/imager/u/large/125277948/f58eca086858632543b39fcd0d2636fd.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/roxyimages/54f226755c7371c1651c5b0920ca563b89e03860.jpg']},
          u'_id': u'CAFB17AB66382E6834B1AB6ED854750054188789532F80DE99B7253B811BDB1F'}),
        (u'C0BABA15173709DA7A56207670578312CA30A7E94EDBF68A8140D8D5AA3C20FC',
         u'https://s3.amazonaws.com/memex-images/full/2d7348b6561c9329f8f8841dc929738ada17e95c.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131973,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'60C23254F4EADC5B927316AF6596BFC81CBE4187AE00419F7EAB20424F34ED91'],
              u'timestamp': [u'2016-01-26T13:15:21'],
              u'obj_original_url': [u'http://img13.asexyservice.com/4/p/200/56a6f1147f975rfRkwq_jZw8Dfd2a72kQGmO6PRwRaWKjXWdp4_200.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/2d7348b6561c9329f8f8841dc929738ada17e95c.jpg']},
          u'_id': u'C0BABA15173709DA7A56207670578312CA30A7E94EDBF68A8140D8D5AA3C20FC'}),
        (u'6687736C0D7E5702DFBA1B94EC8F75E17CE956C5649C4354F5CDD828AEB226D6',
         u'https://s3.amazonaws.com/memex-images/full/229d071eafac66b81b47c7e7da041c6c91fc9ecf.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761131973,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'5B193E4B99B3C64CA403D0EF4231D8E58554360103DBF069E09C9308B12FEFFC'],
              u'timestamp': [u'2016-01-26T13:15:18'],
              u'obj_original_url': [u'http://img7.asexyservice.com/Z/A/200/56a6f114be85a8267wNaP8K32cfsGA_emZr_DrkTMg1mZC84AZ_200.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/memex-images/full/229d071eafac66b81b47c7e7da041c6c91fc9ecf.jpg']},
          u'_id': u'6687736C0D7E5702DFBA1B94EC8F75E17CE956C5649C4354F5CDD828AEB226D6'}),
        (u'70520066B7DB04EF9B01858AFC652E973A2098B6039DBDB014063696B0480B82',
         u'https://s3.amazonaws.com/roxyimages/9bb2b24ccc451467da2e7ae89782b2d9ede6b0c0.jpg',
         {u'_type': u'escorts', u'_timestamp': 1464761132012,
          u'_index': u'memex-domains_2016.01', u'_score': 0.0,
          u'fields': {
              u'obj_parent': [u'CADCDF5FE92191882A3786FC2A80D3C55F4429F8C35DAA2BB68EC0554EDEEBB2'],
              u'timestamp': [u'2015-11-17T06:36:19'],
              u'obj_original_url': [u'http://images2.backpage.com/imager/u/large/222290798/8fd6fdf9abf273e7b235105742f54403.jpg'],
              u'obj_stored_url': [u'https://s3.amazonaws.com/roxyimages/9bb2b24ccc451467da2e7ae89782b2d9ede6b0c0.jpg']},
          u'_id': u'70520066B7DB04EF9B01858AFC652E973A2098B6039DBDB014063696B0480B82'})]
    readable_images = FD.download_images(batch)
    print(readable_images)
from file_downloader import FileDownloader

with open('io-data/download-logger.txt', 'a') as log_file:
    file_downloader = FileDownloader(file_with_urls='t-shirt-links.txt',
                                     logger=log_file)
    # file_downloader = FileDownloader(file_with_urls='1.txt')
    file_downloader.download_files()
import argparse
import logging

from file_downloader import FileDownloader

# URL1 = 'http://olps.cgtransport.org/Esewa/img/ESEWA2.jpg'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--url')
    parser.add_argument('--save_as', type=str, default='download_file')
    parser.add_argument('--threads', type=int, default=1)
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING

    logger = logging.getLogger('file_downloader_logger')
    logger.setLevel(log_level)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(log_level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    downloader = FileDownloader(args.url, args.save_as, logger, args.threads)
    downloader.download()
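# Example invocations, assuming the script above is saved as download.py (the
# filename is illustrative; the flags are the ones defined by the parser):
#
#   python download.py --url http://olps.cgtransport.org/Esewa/img/ESEWA2.jpg
#   python download.py --url http://olps.cgtransport.org/Esewa/img/ESEWA2.jpg \
#       --save_as ESEWA2.jpg --threads 4 -v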
def main(self):
    '''Run the pipeline end to end, or resume it from the TST170, SMP2 or
    file download stage depending on the command line arguments.'''
    # Always do these steps regardless of option
    # Parse variables files to extract relevant sample information
    # Identify samples to upload to BaseSpace
    samples_to_upload = identify_samples()
    # Load and parse out variables from variables files associated with each sample
    all_variables = load_all_variables(samples_to_upload, os.getcwd())
    # Identify the worksheet number which will be used as the project name in BaseSpace
    worksheet = identify_worksheet(all_variables)
    # Pair samples- DNA sample is key, RNA sample to look up- if no RNA sample, it is None
    sample_pairs = create_sample_pairs(all_variables)
    # Write out sample pairs to log file for checking if needed
    log.warning(f"sample pairs are {sample_pairs}")
    # Locate the fastqs associated with all samples
    all_fastqs = locate_all_fastqs(samples_to_upload, os.getcwd())
    # Create a project in BaseSpace- will not create if it already exists,
    # but will still return the project id
    upload = FileUpload(self.authentication_token, worksheet, samples_to_upload, all_fastqs)
    project = upload.create_basespace_project()
    log.info(f"Project {worksheet} created")
    log.warning(f"Project id for project name {worksheet} is {project}")
    # If the whole pipeline is required, upload the fastq files
    if not args.tst170 and not args.smp2 and not args.dl_files:
        print("uploading fastq files for all samples")
        upload.upload_files()
    # Create launch app object for the TST170 app
    launch_tst = LaunchApp(self.authentication_token, worksheet, project,
                           app_name, app_version, sample_pairs)
    # If resuming from TST170 or running the full pipeline, launch the TST170 app
    if not args.smp2 and not args.dl_files:
        # Launch the TST170 application for each pair in turn
        # IMPORTANT NOTE: Only processes paired data
        tst_170 = launch_tst.launch_tst170_pairs()
        # Dump data to file
        with open(os.path.join(os.getcwd(), "tst_170.json"), 'w') as t:
            json.dump(tst_170, t)
    # If resuming from SMP2v3, load the required TST170 data from file
    elif args.smp2:
        try:
            with open(os.path.join(os.getcwd(), "tst_170.json")) as ts:
                tst_170 = json.load(ts)
        except FileNotFoundError:
            raise FileNotFoundError(
                "Could not find file tst_170.json. Cannot resume pipeline from SMP2 step. "
                "Please delete TST170 analysis in BaseSpace and resume pipeline from "
                "TST170 stage.")
    # If resuming from SMP2v3, resuming from TST170 or running the full
    # pipeline, launch the SMP2 app
    if not args.dl_files:
        # Create launch app object for SMP2 v3 if not just downloading files-
        # poll TST170 and when complete launch SMP2
        launch_smp = LaunchApp(self.authentication_token, worksheet, project,
                               smp2_app_name, smp2_app_version, sample_pairs, tst_170)
        # Poll the TST170 appsessions until completion, then launch the SMP2 app
        smp_appsession = launch_smp.poll_tst170_launch_smp2()
        # Dump data to file
        with open(os.path.join(os.getcwd(), "smp.json"), 'w') as s:
            json.dump(smp_appsession, s)
    # If only downloading files from a completed SMP2 app, create a LaunchApp
    # object for the SMP2 app- allows for polling of SMP2
    if args.dl_files:
        # Load in the required SMP2 data from file
        try:
            with open(os.path.join(os.getcwd(), "smp.json")) as sm:
                smp = json.load(sm)
        except FileNotFoundError:
            raise FileNotFoundError(
                "Could not find file smp.json. Cannot resume pipeline from download step. "
                "Please delete SMP2 analysis in BaseSpace and resume pipeline from "
                "SMP2 stage.")
        launch_smp = LaunchApp(self.authentication_token, worksheet, project,
                               smp2_app_name, smp2_app_version, sample_pairs,
                               None, smp)  # None as TST170 app data not required
    # Poll the SMP2 appsessions until completion
    smp_appresults = launch_smp.poll_smp2()
    # Download all required files- every step requires this
    file_download = FileDownloader(self.authentication_token, smp_appresults, worksheet)
    file_download.download_files()
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import requests

from file_downloader import FileDownloader

file_downloader = FileDownloader()
for page_index in range(1, 110):
    with open('io-data/log.txt', 'a') as log_file:
        try:
            with open('io-data/t-shirt-links.txt', 'a') as t_shirt_links:
                t_shirt_links.write(
                    'Starting to process page {0}\n'.format(page_index))
                headers = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/71.0.3578.98 Safari/537.36'
                }
                try:
                    pages_with_data = requests.get(
                        'https://www.gearbest.com/men-s-t-shirts-c_11926/{0}.html?page_size=120'
                        .format(str(page_index)),
                        headers=headers)
                    # r = requests.get('https://www.threadless.com/catalog/type,guys/style,tees/page,{0}'.format(page_index))
                except RequestException:
                    print('Error in page {0}'.format(str(page_index)))
                    # pages_with_data is unbound on failure, so skip this page
                    continue
                links_parser = BeautifulSoup(pages_with_data.text, 'html.parser')
                # find_all needs a dict ({'class': ...}); the original passed the
                # set literal {'class', 'gbGoodsItem_thumb'} by mistake
                results = links_parser.find_all('a', {'class': 'gbGoodsItem_thumb'})
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
import gevent
from gevent import monkey

# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

app = Flask(__name__)
app.config.from_object('config')
db = SQLAlchemy(app)

import logging
from logging.handlers import RotatingFileHandler

file_handler = RotatingFileHandler('tmp/downloads.log', 'a', 1 * 1024 * 1024, 10)
file_handler.setFormatter(logging.Formatter(
    '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'))
app.logger.setLevel(logging.DEBUG)
file_handler.setLevel(logging.DEBUG)
app.logger.addHandler(file_handler)
app.logger.info('===startup downloader===')
log = app.logger.info
log2 = app.logger.debug

from downloads import file_manager
from file_downloader import FileDownloader

Downloads = FileDownloader()
import sys

from PySide2.QtCore import QObject, Signal, Property, QUrl
from PySide2.QtGui import QGuiApplication, QIcon
from PySide2.QtQml import QQmlApplicationEngine

from file_downloader import FileDownloader
from program_settings import SettingsManager
import resources

app = QGuiApplication(sys.argv)
app.setWindowIcon(QIcon(':/mi.png'))
engine = QQmlApplicationEngine()
settings_man = SettingsManager()
fd = FileDownloader()
engine.rootContext().setContextProperty("filedownloader", fd)
engine.rootContext().setContextProperty("sm", settings_man)
engine.load(QUrl.fromLocalFile(':/Main.qml'))
if not engine.rootObjects():
    sys.exit(-1)
sys.exit(app.exec_())