def convertDir(in_dir, out_dir, lexicon):
    """
    Converts a directory of osis files to usfm
    :param in_dir: directory containing the osis xml files
    :param out_dir: directory where the generated usfm files are written
    :param lexicon: the lexicon used while converting each file
    :return:
    """
    logger = logging.getLogger(LOGGER_NAME)
    if os.path.isfile(in_dir):
        raise Exception('Input must be a directory')

    # collect only the files at the top level of in_dir (no recursion)
    input_files = []
    for root, dirs, files in os.walk(in_dir):
        input_files.extend(files)
        break

    for file_name in input_files:
        # only .xml/.osis files are convertible; the verse map is metadata
        convertible = file_name.endswith('.xml') or file_name.endswith('.osis')
        if file_name.lower() == 'versemap.xml' or not convertible:
            print('Skipping file {}'.format(file_name))
            continue

        print('Processing {}...'.format(file_name))
        usfm = convertFile(os.path.join(in_dir, file_name), lexicon)
        book_id = os.path.splitext(file_name)[0]
        book_meta = get_book_by_osis_id(book_id)
        if not book_meta:
            message = 'Missing book meta data for {}'.format(book_id)
            print(message)
            logger.error(message)
            continue

        out_file = os.path.join(out_dir, '{}-{}.usfm'.format(
            book_meta['sort'], book_meta['usfm_id']))
        write_file(out_file, usfm)
def _exportToFile(self, path):
    """
    Exports the database to a file
    :param path: file path where the serialized database is written
    :return:
    """
    serialized = json.dumps(self._db)
    write_file(path, serialized)
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing.

    Each usfm3 file is downgraded to usfm2 in a temporary directory
    (the usx builder expects usfm2), then converted to usx.
    :param usfm_dir: directory containing the usfm3 source files
    :param usx_dir: directory where the usx output is written
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    usfm2_dir = tempfile.mkdtemp(prefix='usfm2')
    try:
        for name in files:
            if name == '.DS_Store':
                continue
            f = os.path.join(usfm_dir, name)
            usfm3 = read_file(f)
            usfm2 = usfm3_to_usfm2(usfm3)
            out_f = os.path.join(usfm2_dir, name)
            write_file(out_f, usfm2)
        UsfmTransform.buildUSX(usfm2_dir, usx_dir, '', True)
    finally:
        # fix: the original `try: rmtree(...) finally: pass` did NOT
        # suppress cleanup errors (a bare finally re-raises the pending
        # exception); ignore_errors=True actually makes cleanup best-effort.
        shutil.rmtree(usfm2_dir, ignore_errors=True)
def _prep_text_upload(self, key, data):
    """
    Prepares some data for upload to s3
    :param key: the s3 key; also used as the temp file name
    :param data: the text content to stage for upload
    :return: dict with the upload `key` and local `path`
    """
    staging_path = os.path.join(self.temp_dir, key)
    write_file(staging_path, data)
    return {
        'key': key,
        'path': staging_path
    }
def _prep_json_upload(self, key, data):
    """
    Prepares some data for upload to s3
    :param key: the s3 key; also used as the temp file name
    :param data: a json-serializable object to stage for upload
    :return: dict with the upload `key` and local `path`
    """
    staging_path = os.path.join(self.temp_dir, key)
    serialized = json.dumps(data, sort_keys=True)
    write_file(staging_path, serialized)
    return {
        'key': key,
        'path': staging_path
    }
def mapDir(usfm_dir, words_rc, output_dir, global_search=False, map_phrases=True):
    """
    Maps tW to words within each USFM file found in the directory.
    :param usfm_dir: a directory containing USFM files generated by `csvtousfm3`
    :param words_rc: the tW resource container
    :type words_rc: ResourceContainer.RC
    :param output_dir: a directory where the newly mapped usfm will be saved
    :param global_search: performs a global word-by-word search in addition to the search by occurrence
    :param map_phrases: when True, phrase mapping runs after the occurrence mapping
    :return:
    """
    usfm_files = []
    strongs_index = {}
    # only scan the top level of the directory
    for root, dirs, files in os.walk(usfm_dir):
        usfm_files.extend(files)
        break

    print('Generating occurrences index')
    location_index = indexWordsLocation(words_rc)
    category_index = indexWordsCategory(words_rc)

    if map_phrases:
        print('Phrase mapping enabled.')
    if global_search:
        print('Global search enabled.')
        print('Generating strongs index.')
        strongs_index = indexWordByStrongs(words_rc)

    for file_name in usfm_files:
        if not file_name.endswith('.usfm'):
            continue
        file = os.path.join(usfm_dir, file_name)
        print('{}'.format(file_name))

        mapped = mapUSFMByOccurrence(usfm=read_file(file),
                                     words_rc=words_rc,
                                     words_index=location_index['occurrences'],
                                     words_category_index=category_index)
        if map_phrases:
            mapped = mapPhrases(mapped)

        if global_search:
            mapped = mapUSFMByGlobalSearch(
                usfm=mapped,
                words_strongs_index=strongs_index,
                words_false_positives_index=location_index['false_positives'],
                words_category_index=category_index)
            # NOTE: if we need to add phrase mapping to global search un-comment these lines
            # if map_phrases:
            #     mapped = mapPhrases(mapped)

        outfile = os.path.join(output_dir, os.path.basename(file))
        write_file(outfile, mapped)
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir: directory containing the usfm files (rewritten in place)
    :param usx_dir: directory where the usx output is written
    :return:
    """
    # strip word data: clean each usfm file in place before conversion
    for name in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, name)
        cleaned = convert_chunk_markers(strip_word_data(read_file(path)))
        write_file(path, cleaned)
    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def build_usx(usfm_dir, usx_dir, logger=None):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir: directory containing the usfm files (rewritten in place)
    :param usx_dir: directory where the usx output is written
    :param logger: optional logger for debug output
    :return:
    """
    # strip word data: clean each usfm file in place before conversion
    for name in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, name)
        cleaned = remove_unknown_markers(
            convert_chunk_markers(strip_word_data(read_file(path))))
        write_file(path, cleaned)
    if logger:
        logger.debug("Actual USX conversion into {}".format(usx_dir))
    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def sign_file(self, file_to_sign, private_pem_file=None): """ Generates a .sig file and returns the full file name of the .sig file :param str|unicode file_to_sign: :param str|unicode|None private_pem_file: :return: str|unicode The full file name of the .sig file """ # if pem file was not passed, use the default one if not private_pem_file: private_pem_file = self._default_priv_pem() # use openssl to sign the content sha384_file = file_to_sign + '.sha384' sign_com = 'openssl dgst -sha384 -sign {0} -out {1} {2}'.format( private_pem_file, sha384_file, file_to_sign) command = shlex.split(sign_com) com = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE) out, err = com.communicate() if err: raise Exception(err) # base64 encode the signature file_name_without_extension = os.path.splitext(file_to_sign)[0] sig_file_name = '{}.sig'.format(file_name_without_extension) sign_com = 'openssl base64 -in {0} -out {1}'.format( sha384_file, sig_file_name) command = shlex.split(sign_com) com = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE) out, err = com.communicate() if err: raise Exception(err) # get the base64 encoded signature with codecs.open(sig_file_name, 'r', encoding='utf-8') as in_file: signed_content = in_file.read() # save the signed content file_content = [] signature = {'si': 'uW', 'sig': signed_content} file_content.append(signature) write_file(sig_file_name, file_content) return sig_file_name
def decrypt_file(source_file_name, destination_file_name):
    """
    Decrypts a file using the AWS encryption key 'signing_key'
    :param string source_file_name:
    :param string destination_file_name:
    :return: bool True if successful, otherwise False
    """
    kms = boto3.client('kms')

    # read the encrypted payload as raw bytes
    with codecs.open(source_file_name, 'rb') as in_file:
        ciphertext = in_file.read()

    result = kms.decrypt(CiphertextBlob=ciphertext)
    if 'Plaintext' not in result:
        raise Exception(
            'File not successfully decrypted: {}'.format(source_file_name))

    write_file(destination_file_name, result['Plaintext'])
    return True
def _run(self):
    """
    Builds the consolidated catalog from the progress table and, when new
    content was processed and the catalog changed, uploads catalog.json.
    :return: dict with keys `success`, `incomplete`, `message`, `catalog`
    """
    completed_items = 0
    items = self.progress_table.query_items()

    for item in items:
        repo_name = item['repo_name']
        self.logger.info('Processing {}'.format(repo_name))
        try:
            package = json.loads(item['package'])
        except Exception as e:
            self.report_error('Skipping {}. Bad Manifest: {}'.format(repo_name, e))
            continue

        # a few repo names are handled specially; everything else is a
        # resource container that feeds the catalog
        if repo_name == "catalogs":
            self.catalog['catalogs'] = package
        elif repo_name == 'localization':
            self._build_localization(package)
        elif repo_name == 'versification':
            # TODO: we have not yet determined what to do with versification
            pass
        else:
            if self._build_rc(item, package, self.checker):
                completed_items += 1

    # remove empty languages
    condensed_languages = []
    for lang in self.catalog['languages']:
        if 'resources' in lang and len(lang['resources']) > 0:
            condensed_languages.append(lang)
    self.catalog['languages'] = condensed_languages

    response = {
        'success': False,
        'incomplete': len(self.checker.all_errors) > 0,
        'message': None,
        'catalog': self.catalog
    }

    if completed_items > 0:
        status = self._read_status()
        # skip the upload when the previous run completed and nothing changed
        if status and status['state'] == 'complete' and not self._catalog_has_changed(self.catalog):
            response['success'] = True
            response['message'] = 'No changes detected. Catalog not deployed'
        else:
            # compact separators keep the uploaded catalog small
            cat_str = json.dumps(self.catalog, sort_keys=True, separators=(',', ':'))
            try:
                catalog_path = os.path.join(tempfile.gettempdir(), 'catalog.json')
                write_file(catalog_path, cat_str)
                c_stats = os.stat(catalog_path)
                self.logger.info('New catalog built: {} Kilobytes'.format(c_stats.st_size * 0.001))

                self.api_handler.upload_file(catalog_path,
                                             'v{0}/catalog.json'.format(self.api_version),
                                             cache_time=0)
                # TRICKY: only mark as complete when there are no errors
                if len(self.checker.all_errors):
                    self._publish_status('incomplete')
                else:
                    self._publish_status()

                response['success'] = True
                response['message'] = 'Uploaded new catalog to {0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
            except Exception as e:
                self.checker.log_error('Unable to save catalog: {0}'.format(e))  # pragma: no cover

    if len(self.checker.all_errors) > 0:
        self.report_error(self.checker.all_errors)
    if completed_items == 0:
        self.checker.log_error('There were no formats to process')

    # on failure the (possibly partial) catalog is withheld and the collected
    # errors become the message
    if not response['success']:
        response['catalog'] = None
        response['message'] = '{0}'.format(self.checker.all_errors)

    if(response['success']):
        self.logger.info(response['message'])
    else:
        self.logger.error('Catalog was not published due to errors')

    return response
def _strip_print_script(file_to_sign):
    """
    Removes the automatic print trigger from an html file, in place.
    :param file_to_sign: path to the html file to scrub
    :return:
    """
    content = read_file(file_to_sign)
    write_file(file_to_sign, content.replace('window.print()', ''))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-l', '--lang', dest='lang', required=True,
                        help='The language represented in the CSV file')
    parser.add_argument('-i', '--input', dest='input', required=True,
                        help='CSV file to convert')
    parser.add_argument('-o', '--output', dest='output', required=True,
                        help='Directory where to save the generated USFM')
    args = parser.parse_args(sys.argv[1:])

    if os.path.isfile(args.output):
        raise Exception('Output must be a directory')

    # convert the csv and write one usfm file per book
    for book in convert(args.lang, args.input):
        target = os.path.join(
            args.output, '{}-{}.usfm'.format(book['sort'], book['id']))
        write_file(target, book['usfm'])
def _build_versification(self):
    """
    DEPRECATED we are no longer processing versification.

    Groups the bible chunk definitions by book, writes one chunks.json per
    book into a temp directory, and queues those files for direct upload.
    :return: dict describing the package (repo name, commit, timestamp,
             serialized package, pending uploads)
    """
    bible_dir = os.path.join(self.repo_dir, 'bible')
    versification_dirs = os.listdir(bible_dir)
    books = {}
    package = []
    uploads = []

    # group by project
    for vrs_dir in versification_dirs:
        vrs_id = os.path.basename(vrs_dir)
        book_files = sorted(
            glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json')))
        for b in book_files:
            self.logger.debug('Reading "{}" versification for "{}"'.format(
                vrs_id, b))
            b_id = os.path.splitext(os.path.basename(b))[0]
            try:
                book_vrs = json.loads(read_file(b))
            except Exception as e:
                # fix: replaced the py2-only `raise type, value, tb` form
                # with a raise that parses on both python 2 and 3
                raise Exception('Bad JSON: {0}'.format(e))
            book = WebhookHandler.retrieve_or_make(
                books, b_id, {
                    'identifier': b_id,
                    # fix: the original template mixed manual ('{0}') and
                    # automatic ('{}') field numbering, which raises
                    # ValueError in str.format; all fields are now manual
                    'chunks_url': '{0}/bible/{1}/{2}/v{3}/chunks.json'.format(
                        self.cdn_url, vrs_id, b_id, self.api_version),
                    'chunks': {}
                })
            book['chunks'][vrs_id] = book_vrs

    temp_dir = os.path.join(self.temp_dir, 'versification')
    # fix: `if not os.path.isdir:` tested the function object (always
    # truthy) instead of calling it, so the directory was never created
    if not os.path.isdir(temp_dir):
        os.mkdir(temp_dir)

    for book in books.values():
        # write chunks
        chunk_file = os.path.join(temp_dir, book['identifier'] + '.json')
        write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True))
        # for now we bypass signing and upload chunks directly
        upload_key = 'bible/{}/v{}/chunks.json'.format(
            book['identifier'], self.api_version)
        uploads.append({'key': upload_key, 'path': chunk_file})
        # build package
        del book['chunks']
        package.append(book)

    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': json.dumps(package, sort_keys=True),
        'uploads': uploads,
        'dirty': False
    }
def process_format(self, item, dublin_core, project, format):
    """
    Performs the signing on the format object.
    Files outside of the cdn will not be signed
    :param item: the progress-table item being processed (repo_name, commit_id)
    :param dublin_core: the resource's dublin core metadata
    :param project: this may be None.
    :param format: the format dict; mutated in place (url, signature, size,
                   modified, format, length)
    :return: (already_signed, newly_signed)
    """
    # a non-empty signature means the format was signed on a previous run
    if 'signature' in format and format['signature']:
        return (True, False)
    else:
        self.logger.debug('Signing {}'.format(format['url']))

    base_name = os.path.basename(format['url'])
    file_to_sign = os.path.join(self.temp_dir, base_name)

    # extract cdn key from url
    url_info = urlparse.urlparse(format['url'])
    src_key = url_info.path.lstrip('/')
    sig_key = '{}.sig'.format(src_key)

    build_rules = get_build_rules(format, 'signing')

    # TRICKY: allow dev environments to download from prod environment
    valid_hosts = [self.cdn_bucket]
    if self.stage_prefix():
        if not self.cdn_bucket.startswith(self.stage_prefix()):
            self.logger.warning(
                'Expected `cdn_bucket` to begin with the stage prefix ({}) but found {}'
                .format(self.stage_prefix(), self.cdn_bucket))
        # NOTE(review): str.lstrip strips a *character set*, not a prefix —
        # verify this produces the intended prod bucket name
        prod_cdn_bucket = self.cdn_bucket.lstrip(self.stage_prefix())
        valid_hosts.append(prod_cdn_bucket)
        # TRICKY: force dev environments to handle prod content as external files
        # if format['url'].startswith(prod_cdn_url):
        #     build_rules.append('sign_given_url')

        # TRICKY: some html content is on the api
        if 'html_format' in build_rules:
            valid_hosts.append(self.api_bucket)
            prod_api_bucket = self.api_bucket.lstrip(self.stage_prefix())
            valid_hosts.append(prod_api_bucket)

    # make sure all formats have a media mime type
    if ('format' not in format or not format['format']) and 'url' in format:
        quality = ''
        if 'quality' in format:
            quality = format['quality']
        format['format'] = get_mime_from_url(format['url'], quality)

    # make sure all formats with a media mime type have a size
    if ('size' not in format or not format['size']) and 'url' in format and 'format' in format and format['format']:
        format['size'] = get_remote_file_size(format['url'])

    # verify url is on the cdn
    if not url_info.hostname in valid_hosts:
        # TODO: external media should be imported if it's not too big
        # This allows media to be hosted on third party servers
        format['signature'] = ''  # '{}.sig'.format(format['url'])
        self.logger.warning(
            'cannot sign files outside of the cdn: {}'.format(
                format['url']))
        return (True, True)

    try:
        headers = self.url_headers(format['url'])
    except Exception as e:
        self.report_error('Could not read headers from {}: {}'.format(
            format['url'], e))
        return (False, False)

    # skip files that are too large
    size = int(headers.get('content-length', 0))
    if size > SigningHandler.max_file_size:
        sig_url = '{}.sig'.format(format['url'])
        if not self._safe_url_exists(sig_url):
            self.logger.warning('File is too large to sign {}'.format(
                format['url']))
            # TRICKY: spoof the signature
            sig_file = '{}.sig'.format(file_to_sign)
            write_file(sig_file, [{'si': 'uW', 'sig': ''}])
            format['signature'] = sig_url
            self.cdn_handler.upload_file(sig_file, sig_key)
            if not format['modified']:
                format['modified'] = str_to_timestamp(
                    datetime.datetime.now().isoformat())
            # add file format if missing
            if not 'format' in format or not format['format']:
                try:
                    _, ext = os.path.splitext(file_to_sign)
                    mime = ext_to_mime(ext)
                    format['format'] = mime
                except Exception as e:
                    if self.logger:
                        self.logger.error(e.message)
            return (True, True)

        # finish with manually uploaded signature
        format['size'] = size
        if not format['modified']:
            format['modified'] = str_to_timestamp(
                datetime.datetime.now().isoformat())
        format['signature'] = sig_url
        return (False, True)

    # download file
    try:
        if 'sign_given_url' in build_rules or 'html_format' in build_rules:
            # report error if response is 400+
            if headers.status >= 400:
                self.logger.warning('Resource not available at {}'.format(
                    format['url']))
                return (False, False)
            self.download_file(format['url'], file_to_sign)
        else:
            # TRICKY: most files to be signed are stored in a temp directory
            src_temp_key = 'temp/{}/{}/{}'.format(item['repo_name'],
                                                  item['commit_id'], src_key)
            self.cdn_handler.download_file(src_temp_key, file_to_sign)
    except Exception as e:
        self.report_error(
            'The file "{}" could not be downloaded: {}'.format(
                base_name, e))
        return (False, False)

    # strip print script from html
    if 'html_format' in build_rules:
        self.logger.debug('Removing print script from {} html'.format(
            item['repo_name']))
        self._strip_print_script(file_to_sign)

    # sign file
    sig_file = self.signer.sign_file(file_to_sign)
    try:
        self.signer.verify_signature(file_to_sign, sig_file)
    except RuntimeError:
        if self.logger:
            self.logger.warning(
                'The signature was not successfully verified.')
        return (False, False)

    # TRICKY: re-format html urls
    # html files are re-keyed to the canonical media/html location
    if 'html_format' in build_rules:
        html_name = dublin_core['identifier']
        if project:
            html_name = project['identifier']
        src_key = '{}/{}/v{}/media/html/{}.html'.format(
            dublin_core['language']['identifier'],
            dublin_core['identifier'], self.api_version, html_name)
        sig_key = '{}.sig'.format(src_key)
        format['url'] = '{}/{}'.format(self.cdn_url, src_key)

    # upload files
    if 'sign_given_url' not in build_rules or 'html_format' in build_rules:
        # TRICKY: upload temp files to production
        self.cdn_handler.upload_file(file_to_sign, src_key)
    # the signature is always published alongside the content key
    self.cdn_handler.upload_file(sig_file, sig_key)

    # add the url of the sig file to the format
    format['signature'] = '{}.sig'.format(format['url'])

    # read modified date from file
    stats = os.stat(file_to_sign)
    if not format['modified']:
        modified = headers.get('last-modified')
        if modified:
            # TRICKY: http header gives an odd date format
            date = datetime.datetime.strptime(modified,
                                              '%a, %d %b %Y %H:%M:%S %Z')
            modified = str_to_timestamp(date.isoformat())
        else:
            modified = unix_to_timestamp(stats.st_mtime)
        format['modified'] = modified
    format['size'] = stats.st_size

    # retrieve playback time from multimedia files
    _, ext = os.path.splitext(file_to_sign)
    if ext == '.mp3':
        audio = MP3(file_to_sign)
        format['length'] = audio.info.length
    elif ext == '.mp4':
        video = MP4(file_to_sign)
        format['length'] = video.info.length

    # add file format if missing
    if not 'format' in format or not format['format']:
        try:
            mime = ext_to_mime(ext)
            format['format'] = mime
        except Exception as e:
            if self.logger:
                self.logger.error(e.message)

    # clean up disk space
    os.remove(file_to_sign)

    return (False, True)