def identify_date_with_over_allocation(df, Resource, label):
    daterange = pd.date_range(datetime.today(), datetime.strptime('2020-12-31', "%Y-%m-%d"))

    # Find days on which this resource is over-allocated.
    rows = []
    for single_date in daterange:
        holddf = df[(df['Start_Date'] <= single_date.date()) & (df['Finish_Date'] >= single_date.date())]
        rows.append({'Date': single_date.date(), 'Count': len(holddf)})
    # Build the frame in one pass; DataFrame.append was removed in pandas 2.0.
    Utilizationdf = pd.DataFrame(rows, columns=['Date', 'Count'])

    mean = round(np.mean(Utilizationdf['Count']))
    std = round(np.std(Utilizationdf['Count']))
    print('minimum =', round(np.amin(Utilizationdf['Count'])))
    print('mean =', mean)
    print('maximum =', round(np.amax(Utilizationdf['Count'])))
    print('Standard Deviation =', std)

    Overutilizationdf = Utilizationdf[Utilizationdf['Count'] > mean + std]
    underutilizationdf = Utilizationdf[Utilizationdf['Count'] < mean - std]

    resource_name = df.iloc[0].Other_Activity_Resource
    print('Allocation is higher than mean + 1 standard deviation for ' + resource_name + ' on the following dates:')
    print(Overutilizationdf)
    Overutilizationdf.to_csv(sanitize('_'.join([str(Resource), str(label), 'Overutilizationdf.csv'])))

    print('Allocation is lower than mean - 1 standard deviation for ' + resource_name + ' on the following dates:')
    print(underutilizationdf)
    underutilizationdf.to_csv(sanitize('_'.join([str(Resource), str(label), 'underutilizationdf.csv'])))
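# Worked example of the thresholding above (illustrative numbers only):
# with daily counts [2, 3, 3, 4, 10], round(mean) = 4 and round(std) = 3,
# so only the 10-count day exceeds mean + std (7), and no day falls below
# mean - std (1).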
def export(retrieved_annotations: dict, directory: str) -> None:
    for author in retrieved_annotations:
        author_directory = "{}/{}/".format(directory, sanitize(author))
        Path(author_directory).mkdir(parents=True, exist_ok=True)
        books = retrieved_annotations[author]
        for book in books:
            book_file_name = "{}/{}.docx".format(author_directory, sanitize(book))
            document = Document()
            document.add_heading(book, level=1)
            document.add_paragraph(author, style='Caption')
            p_blank = document.add_paragraph("")
            p_blank.line_spacing_rule = WD_LINE_SPACING.DOUBLE
            chapters = books[book]
            for chapter in chapters:
                document.add_paragraph(chapter, style='Title')
                p_blank = document.add_paragraph("")
                p_blank.line_spacing_rule = WD_LINE_SPACING.DOUBLE
                annotations = chapters[chapter]
                for annotation in annotations:
                    comment = ''
                    if annotation.comment:
                        comment = annotation.comment
                    p_annotation = document.add_paragraph(annotation.text, style='Intense Quote')
                    p_annotation.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                    p_comment = document.add_paragraph(comment, style='No Spacing')
                    p_comment.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                    p_blank = document.add_paragraph("")
                    p_blank.line_spacing_rule = WD_LINE_SPACING.DOUBLE
            document.save(book_file_name)
def put_music(music, response, music_folder=None, iter=None):
    if music_folder is None:
        path = './files/music/{0}'.format(music['id'])
    else:
        path = '{0}/{1}'.format(music_folder, music['id'])
    mkdir_if_not_exists(path)

    file_name = sanitize(music['title'].replace('/', '|'))
    file_path = '{0}/{1}.mp3'.format(path, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(file_path)
    print(music['title'])

    tag = id3.Tag()
    tag.parse(file_path)
    try:
        tag.version = id3.ID3_DEFAULT_VERSION
    except Exception:
        tag.version = id3.ID3_V1

    # Fill in missing metadata from the source record.
    if tag.title is None:
        tag.title = music['title']
    if tag.artist is None:
        tag.artist = music['artist']

    if len(music['track_covers']):
        # Replace any existing embedded images with the last (largest) cover.
        for image in [y.description for y in tag.images]:
            tag.images.remove(image)
        img_url = music['track_covers'][-1]
        r = requests.get(img_url)
        tag.images.set(3, r.content, 'image/jpeg')

    track_name = '{0}-{1}-{2}.mp3'.format(
        tag.artist, tag.album if tag.album is not None else '', tag.title)
    tag.title = track_name
    try:
        tag.save()
    except Exception:
        tag.version = id3.ID3_V1
        tag.save()

    # Collapse the '--' left by a missing album, then sanitize.
    track_name = track_name.replace('--', '-').replace('--', '-')
    track_name = sanitize(track_name)
    new_track_path = '{0}/{1}'.format(path, track_name)
    os.rename(file_path, new_track_path)
def main():
    print(request.method)
    if request.method == 'GET':
        return render_template('index.html')
    elif request.method == 'POST':
        # Convert the uploaded images to a PDF.
        files = request.files.getlist("file[]")
        if not files:
            abort(400, 'Provide at least one image')

        # One UUID-based name is used for both the folder and the PDF.
        pdf_name = str(uuid.uuid4())[:18]
        folder_to_save = os.path.join(app.config['UPLOAD_FOLDER'], pdf_name)

        # File sanitization check.
        for file in files:
            if sanitize(file.filename).rsplit('.', 1)[-1].upper() not in [
                    'PNG', 'JPG', 'JPEG', 'JIFF', 'TIFF']:
                abort(400, 'Wrong file type')

        # Create the folder and save the images into it.
        if not os.path.exists(folder_to_save):
            os.mkdir(folder_to_save)
        for image in files:
            image.save(os.path.join(folder_to_save, sanitize(image.filename)))

        # Build the compressed PDF from folder_to_save into
        # app.config['UPLOAD_FOLDER'], then delete the image folder.
        pdf_size = pdf.create_compressed_pdf(folder_to_save, pdf_name)
        if os.path.exists(folder_to_save):
            shutil.rmtree(folder_to_save)

        file_path = os.path.join(app.config['UPLOAD_FOLDER'], pdf_name + '.pdf')
        try:
            return send_file(file_path,
                             mimetype='application/pdf',
                             attachment_filename="Your_small_pdf.pdf",
                             as_attachment=True)
        except Exception:
            return jsonify(pdf_name, pdf_size)
def validate_name(self, name):
    parent = self.context['data']['folder']
    method = self.context['request'].method

    if method in ('PUT', 'PATCH'):
        # On update, ignore the instance being renamed when checking clashes.
        def validator(name):
            return (Folder.objects.filter(folder=parent, name=name)
                    .exclude(pk=self.instance.id).exists()
                    or File.objects.filter(folder=parent, name=name)
                    .exclude(pk=self.instance.id).exists())
    else:
        def validator(name):
            return (Folder.objects.filter(folder=parent, name=name).exists()
                    or File.objects.filter(folder=parent, name=name).exists())

    if validator(name):
        # Append ' (n)' before the extension until the name is free.
        i = 2
        base, ext = os.path.splitext(name)
        new_name = f'{base} ({i}){ext}'
        while validator(new_name):
            i += 1
            new_name = f'{base} ({i}){ext}'
        name = new_name
    return sanitize(name)
def parse(self, response, **kwargs):
    pagina = {"title": None, "url": response.url, "content": None}
    contentType = response.headers["Content-Type"].decode("utf-8").lower()

    if contentType.find("html") != -1:
        page = response.url.split("/")[-2]
        title = response.xpath('//main/section/article/h1/text()').get()
        pagina["title"] = title
        print(title)
        self.log(f'Visited page {title}, {page}')

        # Try to grab only articles that contain paragraphs
        # (and therefore probably useful text).
        articleContent = response.xpath(
            "//div[contains(@class, 'article-content') and p]").get()
        if articleContent:
            filename = sanitize(title).lower().replace(' ', '-')
            filename = self.documentsDir + filename + ".json"
            try:
                with open(filename, 'x') as file:
                    # Convert the HTML content to text (Markdown).
                    contentToText = self.text_maker.handle(articleContent)
                    pagina["content"] = contentToText
                    json.dump(pagina, file)
                    self.log(f'Saved file {filename}')
            except FileExistsError:
                self.log(f'file {filename} already exists, skipping')

        for href in response.xpath(
                "//main//a[not(contains(@class, 'bread-parent'))]/@href").getall():
            # Skip requests back to the same page and anchors within it.
            if href != '/' and href != response.url and href[0] != '#':
                yield scrapy.Request(response.urljoin(href), self.parse)

    elif contentType == "application/pdf":
        page = response.url.split("/")[-1]
        filename = sanitize(page).lower().replace(' ', '-')
        filename = self.documentsDir + filename
        if filename.find(".pdf") == -1:
            filename += ".pdf"
        try:
            with open(filename, 'xb') as file:
                # Write the downloaded PDF.
                file.write(response.body)
                self.log(f'Saved file {filename}')
        except FileExistsError:
            self.log(f'file {filename} already exists, skipping')
def main():
    """Main entry point of the app."""
    Project_Data_Filename = 'Metro West PETE Schedules.xlsx'
    logger.info("Starting Resource Tracker")
    Change_Working_Path('./Data')
    try:
        Project_Data_df = Excel_to_Pandas(sanitize(Project_Data_Filename))
    except Exception:
        logger.error('Can not find Project Data file')
        raise

    Project_Data_df['Start_Date'] = Project_Data_df['Start_Date'].dt.date
    Project_Data_df['Finish_Date'] = Project_Data_df['Finish_Date'].dt.date
    Project_Data_df = Project_Data_df.sort_values(by=['Start_Date'])

    # Stats about Other Activity Resource
    districts = ['AMARILLO', 'BIG SPRING', 'FORT WORTH', 'GRAHAM', 'ODESSA',
                 'SWEETWATER', 'WICHITA FALLS']
    # districts = ['FORT WORTH']
    # districts = planneddf.Other_Activity_Resource.dropna().unique()
    # scenarios = ['Without Assumptions', 'With Assumptions']
    scenarios = ['']
    for item in scenarios:
        for district in districts:
            if item == 'With Assumptions':
                # Assume unassigned electrical work in this district goes to
                # the Ft. Worth P&C crews.
                Project_Data_df.loc[
                    (Project_Data_df['Grandchild'] == 'Electrical Job Planning')
                    & (pd.isnull(Project_Data_df['Other_Activity_Resource']))
                    & (Project_Data_df['Work_Center_Name'] == district),
                    ['Other_Activity_Resource']] = 'Ft. Worth P&C Crews'
                Project_Data_df.loc[
                    (Project_Data_df['Grandchild'] == 'Electrical Construction')
                    & (pd.isnull(Project_Data_df['Other_Activity_Resource']))
                    & (Project_Data_df['Work_Center_Name'] == district),
                    ['Other_Activity_Resource']] = 'Ft. Worth P&C Crews'

            planneddf, resourcemissingdf = filter_Prject_Data_By_Schedule(district, Project_Data_df)
            resourcemissingdf.to_excel(' '.join([district, 'Activities missing resources.xlsx']),
                                       district, index=False, engine='xlsxwriter')
            print_district_stats(planneddf, resourcemissingdf, district)

            for Resource in planneddf.Other_Activity_Resource.dropna().unique():
                print(Resource)
                DATADF = planneddf[
                    (planneddf['Other_Activity_Resource'] == Resource)
                    & (planneddf['Finish_Date'] > pd.Timestamp(datetime.now()))
                    & (planneddf['Finish_Date'] <= pd.Timestamp(
                        datetime.strptime(str(find_the_counstrunction_Season(datetime.today())),
                                          "%Y-%m-%d")))]
                if DATADF.size > 0:
                    make_gnat(DATADF, ' '.join([DATADF['Other_Activity_Resource'].values[0],
                                                'Utilization', item]))
                    identify_date_with_over_allocation(DATADF, Resource, item)
def export(self, retrieved_annotations: dict, directory: str) -> None:
    for author in retrieved_annotations:
        author_directory = "{}/{}/".format(directory, sanitize(author))
        Path(author_directory).mkdir(parents=True, exist_ok=True)
        books = retrieved_annotations[author]
        for book in books:
            book_file_name = "{}/{}.txt".format(author_directory, sanitize(book))
            # Use a context manager so the file is closed even on error.
            with Path(book_file_name).open(mode="a", encoding="utf-16") as book_file:
                chapters = books[book]
                for chapter in chapters:
                    book_file.write("\n\n{}\n".format(chapter))
                    annotations = chapters[chapter]
                    for annotation in annotations:
                        comment = ''
                        if annotation.comment:
                            comment = "({})".format(annotation.comment)
                        last_update = "[{}]".format(annotation.last_update)
                        book_file.write("{} - {}: {}\n\n".format(
                            last_update, comment, annotation.text))
                book_file.write("\n")
def process_caption(caption, lecture_index, lecture_title, lecture_dir, tries=0):
    filename = "%s. %s_%s.%s" % (lecture_index, sanitize(lecture_title),
                                 caption.get("locale_id"), caption.get("ext"))
    filename_no_ext = "%s. %s_%s" % (lecture_index, sanitize(lecture_title),
                                     caption.get("locale_id"))
    filepath = "%s\\%s" % (lecture_dir, filename)

    if os.path.isfile(filepath):
        print("> Captions '%s' already downloaded." % filename)
    else:
        print("> Downloading captions: '%s'" % filename)
        try:
            download(caption.get("url"), filepath, filename)
        except Exception as e:
            if tries >= 3:
                print(f"> Error downloading captions: {e}. Exceeded retries, skipping.")
                return
            else:
                print(f"> Error downloading captions: {e}. Will retry {3 - tries} more times.")
                process_caption(caption, lecture_index, lecture_title,
                                lecture_dir, tries + 1)
                # The retry call handles any conversion; don't fall through.
                return

    if caption.get("ext") == "vtt":
        try:
            print("> Converting captions to SRT format...")
            convert(lecture_dir, filename_no_ext)
            print("> Caption conversion complete.")
            os.remove(filepath)
        except Exception as e:
            print(f"> Error converting captions: {e}")
def parse(data):
    course_dir = "%s\\%s" % (download_dir, course_id)
    if not os.path.exists(course_dir):
        os.mkdir(course_dir)
    chapters = []
    lectures = []

    for obj in data:
        if obj["_class"] == "chapter":
            obj["lectures"] = []
            chapters.append(obj)
        elif obj["_class"] == "lecture" and obj["asset"]["asset_type"] == "Video":
            try:
                chapters[-1]["lectures"].append(obj)
            except IndexError:
                # The course has no starting chapter; treat the lecture as top-level.
                lectures.append(obj)
                lecture_index = lectures.index(obj) + 1
                lecture_path = "%s\\%s. %s.mp4" % (course_dir, lecture_index,
                                                   sanitize(obj["title"]))
                process_lecture(obj, lecture_index, lecture_path, download_dir)

    for chapter in chapters:
        chapter_dir = "%s\\%s. %s" % (course_dir, chapters.index(chapter) + 1,
                                      sanitize(chapter["title"]))
        if not os.path.exists(chapter_dir):
            os.mkdir(chapter_dir)
        for lecture in chapter["lectures"]:
            lecture_index = chapter["lectures"].index(lecture) + 1
            lecture_path = "%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
                                               sanitize(lecture["title"]))
            process_lecture(lecture, lecture_index, lecture_path, chapter_dir)

    print("\n\n\n\n\n\n\n\n=====================")
    print("All downloads completed for course!")
    print("=====================")
def validate_name(self, name):
    parent = self.context['data']['folder']
    if (Folder.objects.filter(folder=parent, name=name).exists()
            or File.objects.filter(folder=parent, name=name).exists()):
        i = 2
        new_name = f'{name} ({i})'
        while (Folder.objects.filter(folder=parent, name=new_name).exists()
               or File.objects.filter(folder=parent, name=new_name).exists()):
            i += 1
            new_name = f'{name} ({i})'
        name = new_name
    return sanitize(name)
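# A standalone sketch of the "(n)"-suffix strategy used by validate_name
# above, with the Folder/File queryset lookups replaced by a plain set of
# existing names (the set-based lookup is an assumption for illustration):
def next_free_name(name, existing):
    """Return `name`, or the first '`name` (n)' not present in `existing`."""
    if name not in existing:
        return name
    i = 2
    while f'{name} ({i})' in existing:
        i += 1
    return f'{name} ({i})'

# next_free_name('notes', {'notes', 'notes (2)'}) -> 'notes (3)'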
def make_gnat(df, title):
    labels = df.PETE_ID.apply(str) + ' - ' + df.Grandchild.apply(str)
    length = len(df.index)
    ticks = [(x + 1) / length * 100 for x in range(length)]

    # Declaring a figure "gnt"
    fig, gnt = plt.subplots(figsize=(19, 15))
    fig.suptitle(title, fontsize=16)

    # Setting Y-axis and X-axis limits
    gnt.set_ylim(0, ticks[-1] + ticks[0])
    gnt.set_xlim(date.today(), datetime.strptime('2020-12-31', "%Y-%m-%d"))

    # Setting and labelling ticks on the y-axis
    gnt.set_yticks(ticks)
    gnt.set_yticklabels(labels)

    # Setting graph attributes
    gnt.grid(True)
    gnt.xaxis_date()

    # Declaring a bar in the schedule for each activity; actual ('A') start
    # dates are drawn in maroon, planned ones in red.
    for x in range(len(df.index)):
        color = 'maroon' if df['Start_Date_Planned\\Actual'].values[x] == 'A' else 'red'
        gnt.barh(ticks[x],
                 (df.Finish_Date.values[x] - df.Start_Date.values[x]),
                 left=df.Start_Date.values[x],
                 height=ticks[0] / 2,
                 align='center',
                 color=color,
                 alpha=0.8)

    fig.autofmt_xdate()
    plt.tight_layout()
    plt.savefig('Output/' + sanitize(title) + '.png')
async def download_and_store(
        self,
        url_object: list,
        session: aiohttp.ClientSession,
        headers: Optional[CaseInsensitiveDict] = None,
        show_progress: bool = True
) -> None:
    """Download the content of a given URL and store it in a file."""
    url, referal = url_object
    filename = sanitize(url.split("/")[-1])
    if (self.folder / self.title / filename).exists():
        logger.debug(str(self.folder / self.title / filename) + " Already Exists")
    else:
        logger.debug("Working on " + url)
        await self.download_file(url, referal=referal, filename=filename,
                                 session=session, headers=headers,
                                 show_progress=show_progress)
        await self.rename_file(filename)
def print_jsoncsl(refs):
    csls = []
    for ref in refs:
        csl = {
            'id': sanitize(ref['objet'] + '-' + ref['ID']),
            'type': 'article-newspaper' if ref['type'] == 'Tribune' else 'personal_communication',
            'letterType': ref['type'],
            'abstract': ref['catchphrase'],
            'container-title': ref['publication'],
            'title': ref['titre'],
            'URL': ref['url'],
            'author': [{'literal': ref['auteurs']}],
            'recipient': [{'literal': ref['destinataire']}],
            'issued': {'raw': ref['date']},
            'note': ref['position']
        }
        csls.append(csl)
    print(json.dumps(csls, indent=4, ensure_ascii=False))
def fingerprinter_upload(request):
    pdf_file = request.FILES.get('pdf-file')
    copy_count = request.POST.get('copy-count', 1)
    suffix = request.POST.get('file-suffix', '')
    try:
        copy_count = int(copy_count)
    except (TypeError, ValueError):
        copy_count = 1

    if pdf_file is not None:
        s = os.path.splitext(pdf_file.name)
        filename = s[0].replace("'", '').replace('"', '')
        extension = s[-1]
        if extension.lower() != '.pdf':
            raise HTTPExceptions.NOT_ACCEPTABLE  # Error code 406

        # Make a random save directory under /tmp.
        rand_path = randword(9)
        save_path = os.path.join('/tmp/', rand_path)
        os.makedirs(save_path)

        filename = sanitize(filename)
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[(),\s]+", "-", filename)
        save_temp_file(filename, pdf_file, subdir=rand_path)

        # Trigger the fingerprint task.
        task_id = refingerprint_pdf.delay(filename, rand_path, copy_count, suffix)
        data = {'directory': rand_path,
                'filename': filename,
                'task_id': str(task_id)}
        return JsonResponse(data)
    else:
        raise Http404('file not provided')
def run(self):
    if len(self.apiKey) < 32:
        raise ConfigError('API key not set.')
    if len(self.appToken) < 32:
        # https://trello.com/app-key
        # Request a read-only application token which never expires.
        url = ('https://trello.com/1/authorize?key={}&name=My+Backup+App'
               '&expiration=never&response_type=token&scope=read').format(self.apiKey)
        print('Application token not set. Please visit {} in your browser '
              'to create an application token.'.format(url))
        return

    # Use the backup time as the folder name.
    self.backupFolder = 'backups/{}'.format(
        sanitize(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    # Fetch all boards.
    response = requests.get(
        'https://api.trello.com/1/members/me/boards?key={}&token={}'.format(
            self.apiKey, self.appToken))
    boards = json.loads(response.text)
    if not boards:
        raise APIError('Error fetching boards. ' + response.text)

    # Fetch boards in organizations.
    response = requests.get(
        'https://api.trello.com/1/members/me/organizations?key={}&token={}'.format(
            self.apiKey, self.appToken))
    orgs = json.loads(response.text)
    orgsDict = {}
    for org in orgs:
        response = requests.get(
            'https://api.trello.com/1/organizations/{}/boards?&key={}&token={}'.format(
                org['id'], self.apiKey, self.appToken))
        orgBoards = json.loads(response.text)
        if not orgBoards:
            raise APIError('Error fetching organization boards. ' + response.text)
        boards.extend(orgBoards)
        orgsDict[org['id']] = org

    # Load the content of each board and save it to file.
    for board in boards:
        orgId = board['idOrganization']
        if orgId is not None and orgId in orgsDict:
            orgName = orgsDict[orgId]['displayName']
        elif orgId is not None:
            orgName = orgId
        else:
            orgName = 'UNKNOWN'
        print('Fetching board {} in organization {}'.format(board['name'], orgName))

        fetchURL = ('https://api.trello.com/1/boards/{}?actions=all&actions_limit=1000'
                    '&card_attachment_fields=all&cards=all&lists=all&members=all'
                    '&member_fields=all&checklists=all&fields=all'
                    '&key={}&token={}').format(board['id'], self.apiKey, self.appToken)
        response = requests.get(fetchURL)
        jsonObj = json.loads(response.text)
        if not jsonObj:
            raise APIError('Error fetching the content of board "{}". '.format(
                board['name']) + response.text)

        fileName = sanitize('org-{}-board-{}.json'.format(orgName, board['name']))
        self.save(fileName, response.text)

        if self.backupAttachments:
            for action in jsonObj['actions']:
                # Only attachment data that carries a url can be fetched.
                if 'attachment' in action['data'] and 'url' in action['data']['attachment']:
                    attachment = action['data']['attachment']
                    print('>>>>Fetching attachment {}: {}'.format(
                        attachment['id'], attachment['name']))
                    response = requests.get(attachment['url'])
                    fileName = sanitize('attachment-{}-{}'.format(
                        attachment['id'], attachment['name']))
                    self.save(fileName, response.text)

    print('Done! {} trello boards have been downloaded and saved in "{}" folder.'.format(
        len(boards), self.backupFolder))
async def convert_to_pdf(self, md5: str, msg: Message):
    ack_msg = await msg.reply_text('About to convert book to PDF...', quote=True)

    # If the PDF is already on record, just forward it.
    book = await BookdlFiles().get_file_by_md5(md5=md5, typ='application/pdf')
    if book:
        await BookDLBot.copy_message(chat_id=msg.chat.id,
                                     from_chat_id=book['chat_id'],
                                     message_id=book['msg_id'])
        await ack_msg.delete()
        return

    _, detail = await Util().get_detail(
        md5, return_fields=['mirrors', 'title', 'extension', 'coverurl'])
    temp_dir = Path.joinpath(Common().working_dir,
                             Path(f'{ack_msg.chat.id}+{ack_msg.id}'))
    if not Path.is_dir(temp_dir):
        Path.mkdir(temp_dir)

    direct_links = await LibgenDownload().get_directlink(detail['mirrors']['main'])
    extension = detail['extension']
    params = {
        'File': direct_links[1],
        'PdfVersion': '2.0',
        'OpenZoom': '100',
        'PdfTitle': '@SamfunBookdlbot - ' + detail['title'],
        'RotatePage': 'ByPage'
    }
    stat_var = f"{ack_msg.chat.id}{ack_msg.id}"
    convert_status[stat_var] = {'Done': False}

    try:
        # Run the blocking conversion in an executor and poll its status.
        loop = asyncio.get_event_loop()
        convert_process = loop.run_in_executor(None, self.__convert, params,
                                               extension, stat_var)
        start_time = time.time()
        while True:
            if convert_status[stat_var]['Done']:
                break
            try:
                await ack_msg.edit_text(
                    f'Conversion to PDF started... {int(time.time() - start_time)}')
            except MessageNotModified as e:
                logger.error(e)
            except FloodWait as e:
                logger.error(e)
                await asyncio.sleep(e.x)
            await asyncio.sleep(2)
        Result = await convert_process
    except ApiError as e:
        logger.error(e)
        await ack_msg.edit_text(str(e))
        shutil.rmtree(temp_dir)
        return

    file_path = Path.joinpath(
        temp_dir, Path('[@SamfunBookdlbot] ' + sanitize(detail['title']) + '.pdf'))
    detail['cost'] = f'ConvertAPI Cost: **{Result.conversion_cost}** seconds.'
    await ack_msg.edit_text('About to download converted file...')

    try:
        async with aiohttp.ClientSession() as dl_ses:
            async with dl_ses.get(Result.file.url) as resp:
                total_size = int(Result.file.size)
                file_name = Result.file.filename
                async with aiofiles.open(file_path, mode="wb") as dl_file:
                    current = 0
                    logger.info(f'Starting download: {file_name}')
                    start_time = time.time()
                    async for chunk in resp.content.iter_chunked(1024 * 1024):
                        await dl_file.write(chunk)
                        current += len(chunk)
                        # Throttle progress edits to one every two seconds.
                        if time.time() - start_time > 2:
                            await ack_msg.edit_text(
                                f'Downloading: **{detail["title"]}**\n'
                                f"Status: **{size.format_size(current, binary=True)}** of "
                                f"**{size.format_size(total_size, binary=True)}**")
                            start_time = time.time()
    except Exception as e:
        logger.exception(e)
        return None

    await Uploader().upload_book(file_path, ack_msg, md5, detail=detail)
from os import listdir, rename
from os.path import isfile, join

from sanitize_filename import sanitize

if __name__ == "__main__":
    target_dir = "./"
    onlyfiles = [f for f in listdir(target_dir) if isfile(join(target_dir, f))]
    for file in onlyfiles:
        sanitized = sanitize(file)
        if sanitized != file:
            # We need to rename the file.
            rename(target_dir + file, target_dir + sanitized)
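# A dry-run variant of the loop above (hypothetical helper, same API):
# print what would be renamed without touching the filesystem.
def preview_renames(target_dir="./"):
    for f in listdir(target_dir):
        if isfile(join(target_dir, f)) and sanitize(f) != f:
            print(f, "->", sanitize(f))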
def upload(request):
    if request.method == 'POST':
        file_ = request.FILES.get('pdf-file')
        processing_error = None
        if file_ is None:
            raise HTTPExceptions.NOT_ACCEPTABLE  # Error code 406

        filename = file_.name
        if not filename or len(filename) < 3 or '.' not in filename:
            raise SuspiciousFileOperation('improper file name')
        filename = sanitize(filename)
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[(),\s]+", "-", filename)

        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]
        if extension not in ('pdf', 'PDF'):
            raise SuspiciousFileOperation('improper file type')
        basename = basename[:60]
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

        # Save to /tmp.
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)

        # Does the file already exist in the system?
        existing_name = check_ocr_file_exists(md5_hash)

        # Does it already have text?
        if check_pdf_has_text(new_filename):
            processing_error = ('This PDF already has text. Use the "Force OCR" button '
                                'to overwrite text with a fresh OCR if desired. If the '
                                'file was OCRd on a previous upload those results will '
                                'be provided.')

        if not existing_name:
            already_exists = False
            # Upload the original to S3.
            s3 = S3(settings.AWS_OCR_BUCKET)
            saved_file = open(tempfile_path, 'rb')
            s3.save_to_bucket(new_filename, saved_file)
            ref = OCRUpload(filename=new_filename, md5_hash=md5_hash, is_original=True)
            ref.save()
            cleanup_temp_file(new_filename)
        else:
            already_exists = True
            # Clean up the temp file before switching to the existing name.
            cleanup_temp_file(new_filename)
            new_filename = existing_name

        data = {
            'file_info': {
                'filename': filename,
                'size': file_.size,
                'new_filename': new_filename,
                'processing_error': processing_error,
                'tempfile_path': tempfile_path,
                'already_exists': already_exists,
                'md5_hash': md5_hash
            }
        }
        return JsonResponse(data)
    return HttpResponseNotAllowed(['POST'])
def get_pod_file_name(self, pod):
    podPublishedOn = self.get_utc_date(pod.published)
    podExtension = self.get_pod_file_extension(pod)
    if "?" in podExtension:
        podExtension = podExtension.rpartition("?")[0]
    return sanitize(podPublishedOn.strftime("%Y-%m-%dT%H-%M-%SZ")
                    + "_" + self.configSection
                    + "_" + pod.title
                    + "." + podExtension)
def filename_for(name):
    return sanitize(name.translate(defang_bad_chars) + '.json')
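# `defang_bad_chars` is defined elsewhere in this module; a minimal sketch
# of such a translation table (this particular character set is an
# assumption, not the original) built with str.maketrans:
example_defang_table = str.maketrans({c: '_' for c in '/\\:*?"<>|'})
# With a table like that, filename_for('a/b:c') would yield 'a_b_c.json'
# before sanitize() applies its own replacement rules.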
def normalize(self):
    """Update the extension, week, and path of the file."""
    self.ext = '.' + self.url.rsplit('.', 1)[1]
    self.week = sanitize(self.week)
    self.path = f'{DOWNLOADS_DIR}/{self.course}/{self.week}/{sanitize(self.name) + sanitize(self.ext)}'
def persist(setName: str, cards: List[Dict[str, Any]]) -> None:
    sanitizedSetName = sanitize(setName)
    filePath = f"{SetUtil.CARDS_DIR}{os.sep}{sanitizedSetName}"
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(cards))
def ref2filename(ref):
    return sanitize(ref['date'] + '_' + slugify(ref['titre']) + '_'
                    + slugify(ref['auteurs']) + '.' + ref['upload'][0]['ext'])
def upload(request):
    if request.method == 'POST':
        file_ = request.FILES['file']
        filename = file_.name
        if not filename or len(filename) < 3 or '.' not in filename:
            raise SuspiciousFileOperation('improper file name')
        filename = sanitize(filename)
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[(),\s]+", "-", filename)

        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]
        basename = basename[:60]
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

        # Save the file to disk temporarily; it is deleted after the S3 upload.
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)
        extension = extension.lower()

        # If the file (or a processed child) exists, return its name.
        existing_name = check_file_exists(md5_hash)
        if existing_name:
            cleanup_temp_file(new_filename)
            return HttpResponse(existing_name)

        # Transform to PDF or CSV if needed.
        process_to_file_type = None
        if extension in ('doc', 'docx', 'odt', 'ott', 'rtf', 'odp', 'ppt', 'pptx'):
            process_to_file_type = 'pdf'
        if extension in ('xls', 'xlsx', 'ods'):
            process_to_file_type = 'csv'
        if process_to_file_type:
            child_name = _soffice_process(tempfile_path, new_filename,
                                          md5_hash, process_to_file_type)
            if child_name:
                cleanup_temp_file(child_name)
                return HttpResponse(child_name)
            else:
                cleanup_temp_file(new_filename)
                raise HTTPExceptions.UNPROCESSABLE_ENTITY

        if extension == 'pdf':
            # Reject image-only PDFs that carry no text layer.
            if not check_pdf_has_text(new_filename):
                cleanup_temp_file(new_filename)
                raise HTTPExceptions.NOT_ACCEPTABLE  # Error code 406

        # Upload to the cloud.
        s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
        saved_file = open(tempfile_path, 'rb')
        s3.save_to_bucket(new_filename, saved_file)

        # Save a reference to the database.
        ref = FileUpload(filename=new_filename, md5_hash=md5_hash,
                         extension=extension, is_original=True)
        ref.save()
        cleanup_temp_file(new_filename)
        return HttpResponse(new_filename)
    return HttpResponseNotAllowed(['POST'])
def cleanForFileName(string):
    return sanitize_filename.sanitize(string)
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]

    if not skip_lectures:
        if lecture_asset["media_license_token"] is None:
            # Not encrypted.
            media_sources = lecture_asset["media_sources"]
            if quality:
                # Find the requested quality or fall back to the best available.
                lecture_url = next(
                    (x["src"] for x in media_sources if x["label"] == str(quality)),
                    media_sources[0]["src"])
            else:
                # The best quality is at the first index.
                lecture_url = media_sources[0]["src"]
            if not os.path.isfile(lecture_path):
                try:
                    download(lecture_url, lecture_path, lecture_title)
                except Exception as e:
                    # A retry could be added here.
                    print(f"> Error downloading lecture: {e}. Skipping...")
            else:
                print("> Lecture '%s' is already downloaded, skipping..." % lecture_title)
        else:
            # Encrypted.
            print("> Lecture '%s' has DRM, attempting to download" % lecture_title)
            # The folder for ephemeral download files.
            lecture_working_dir = "%s\\%s" % (working_dir, lecture_asset["id"])
            media_sources = lecture_asset["media_sources"]
            if not os.path.exists(lecture_working_dir):
                os.mkdir(lecture_working_dir)
            if not os.path.isfile(lecture_path):
                mpd_url = next((x["src"] for x in media_sources
                                if x["type"] == "application/dash+xml"), None)
                if not mpd_url:
                    print("> Couldn't find dash url for lecture '%s', skipping..."
                          % lecture_title)
                    return
                media_info = manifest_parser(mpd_url)
                handle_irregular_segments(media_info, lecture_title,
                                          lecture_working_dir, lecture_path)
                cleanup(lecture_working_dir)
            else:
                print("> Lecture '%s' is already downloaded, skipping..." % lecture_title)

    # Process assets.
    if dl_assets:
        assets = []
        for asset in lecture["supplementary_assets"]:
            if asset["asset_type"] == "File":
                assets.append(asset)
                asset_filename = asset["filename"]
                download_url = next((x["file"] for x in asset["download_urls"]["File"]
                                     if x["label"] == "download"), None)
                if download_url:
                    try:
                        download(download_url,
                                 "%s\\%s" % (lecture_dir, asset_filename),
                                 asset_filename)
                    except Exception as e:
                        print(f"> Error downloading lecture asset: {e}. Skipping")
                        continue
            elif asset["asset_type"] == "Article":
                assets.append(asset)
                asset_path = "%s\\%s.html" % (lecture_dir, sanitize(lecture_title))
                with open(asset_path, 'w') as f:
                    f.write(asset["body"])
            elif asset["asset_type"] == "ExternalLink":
                assets.append(asset)
                asset_path = "%s\\%s. External URLs.txt" % (lecture_dir, lecture_index)
                with open(asset_path, 'a') as f:
                    f.write("%s : %s\n" % (asset["title"], asset["external_url"]))
        print("> Found %s assets for lecture '%s'" % (len(assets), lecture_title))

    # Process captions.
    if dl_captions:
        captions = []
        for caption in lecture_asset.get("captions"):
            if not isinstance(caption, dict):
                continue
            if caption.get("_class") != "caption":
                continue
            download_url = caption.get("url")
            if not download_url or not isinstance(download_url, str):
                continue
            lang = (caption.get("language") or caption.get("srclang")
                    or caption.get("label")
                    or caption.get("locale_id").split("_")[0])
            ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
            if caption_locale == "all" or caption_locale == lang:
                captions.append({
                    "language": lang,
                    "locale_id": caption.get("locale_id"),
                    "ext": ext,
                    "url": download_url
                })
        for caption in captions:
            process_caption(caption, lecture_index, lecture_title, lecture_dir)
def process_download_video(url, path, type):
    while url == "":
        print("url can't be empty")
        url = input("*Paste the link here:")
    print("Downloading...")

    # Default to the root of the last available drive when no folder is given.
    if path == "":
        drives = get_drives()
        dest_dir = drives[-1] + ":\\"
    else:
        dest_dir = path + "\\"

    try:
        video = YouTube(url)
        name = sanitize(video.title)
        downloaded_video = video.streams.get_highest_resolution().download()

        if "mp4" in type or type == "":
            file = dest_dir + name + ".mp4"
            shutil.move(downloaded_video, file)
            return "Video downloaded: " + file
        else:
            # Extract the audio track and write it in the requested format.
            clip = VideoFileClip(downloaded_video)
            audio_file = downloaded_video.split(".mp4", 1)[0] + f".{type}"
            audio_clip = clip.audio
            audio_clip.write_audiofile(audio_file, verbose=False, logger=None)
            audio_clip.close()
            clip.close()
            os.remove(downloaded_video)
            file = dest_dir + name + f".{type}"
            shutil.move(audio_file, file)
            return "Audio downloaded: " + file
    except Exception:
        print("invalid folder name")
        return