示例#1
0
def convertDir(in_dir, out_dir, lexicon):
    """
    Converts a directory of osis files to usfm.

    :param in_dir: directory containing the osis xml files to convert
    :param out_dir: directory where the generated usfm files are written
    :param lexicon: lexicon passed through to the file converter
    :return: None
    """
    logger = logging.getLogger(LOGGER_NAME)

    if os.path.isfile(in_dir):
        raise Exception('Input must be a directory')

    # take only the files in the top level of in_dir (no recursion);
    # a missing directory yields an empty list rather than an error
    input_files = next(os.walk(in_dir), (None, None, []))[2]

    for file_name in input_files:
        is_verse_map = file_name.lower() == 'versemap.xml'
        looks_like_osis = file_name.endswith(('.xml', '.osis'))
        if is_verse_map or not looks_like_osis:
            print('Skipping file {}'.format(file_name))
            continue

        print('Processing {}...'.format(file_name))
        usfm = convertFile(os.path.join(in_dir, file_name), lexicon)
        book_id = os.path.splitext(file_name)[0]
        book_meta = get_book_by_osis_id(book_id)
        if not book_meta:
            # without metadata we cannot name the output file
            message = 'Missing book meta data for {}'.format(book_id)
            print(message)
            logger.error(message)
            continue

        out_name = '{}-{}.usfm'.format(book_meta['sort'], book_meta['usfm_id'])
        write_file(os.path.join(out_dir, out_name), usfm)
示例#2
0
 def _exportToFile(self, path):
     """
     Exports the database to a file.

     :param path: destination file path for the serialized database
     :return: None
     """
     serialized = json.dumps(self._db)
     write_file(path, serialized)
示例#3
0
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing.

    The usfm3 sources are first down-converted to usfm2 in a temporary
    directory, which is removed when the conversion finishes or fails.
    :param usfm_dir: directory containing the source usfm3 files
    :param usx_dir: directory where the generated usx will be written
    :return: None
    """
    # strip word data
    files = os.listdir(usfm_dir)
    usfm2_dir = tempfile.mkdtemp(prefix='usfm2')
    try:
        for name in files:
            if name == '.DS_Store':
                continue
            f = os.path.join(usfm_dir, name)
            usfm3 = read_file(f)
            usfm2 = usfm3_to_usfm2(usfm3)
            out_f = os.path.join(usfm2_dir, name)
            write_file(out_f, usfm2)

        UsfmTransform.buildUSX(usfm2_dir, usx_dir, '', True)
    finally:
        # FIX: the original wrapped rmtree in `try: ... finally: pass`, a
        # no-op that neither suppressed cleanup errors nor added cleanup.
        # ignore_errors keeps a failed cleanup from masking the real exception.
        shutil.rmtree(usfm2_dir, ignore_errors=True)
示例#4
0
 def _prep_text_upload(self, key, data):
     """
     Prepares some data for upload to s3.

     :param key: the s3 key the data will be uploaded under
     :param data: the raw text to stage on disk
     :return: a dict with the upload ``key`` and the staged file ``path``
     """
     staging_path = os.path.join(self.temp_dir, key)
     write_file(staging_path, data)
     return {
         'key': key,
         'path': staging_path
     }
示例#5
0
 def _prep_json_upload(self, key, data):
     """
     Prepares some data for upload to s3.

     :param key: the s3 key the data will be uploaded under
     :param data: a json-serializable object to stage on disk
     :return: a dict with the upload ``key`` and the staged file ``path``
     """
     staging_path = os.path.join(self.temp_dir, key)
     serialized = json.dumps(data, sort_keys=True)
     write_file(staging_path, serialized)
     return {
         'key': key,
         'path': staging_path
     }
示例#6
0
def mapDir(usfm_dir,
           words_rc,
           output_dir,
           global_search=False,
           map_phrases=True):
    """
    Maps tW to words within each USFM file found in the directory.
    :param usfm_dir: a directory containing USFM files generated by `csvtousfm3`
    :param words_rc: the tW resource container
    :type words_rc: ResourceContainer.RC
    :param output_dir: a directory where the newly mapped usfm will be saved
    :param global_search: performs a global word-by-word search in addition to the search by occurrence
    :param map_phrases: when True, phrase mapping is applied after the occurrence mapping
    :return:
    """
    usfm_files = []
    strongs_index = {}
    # collect only the files in the top level of usfm_dir; `break` stops
    # os.walk before it descends into sub-directories
    for root, dirs, files in os.walk(usfm_dir):
        usfm_files.extend(files)
        break

    print('Generating occurrences index')
    location_index = indexWordsLocation(words_rc)
    category_index = indexWordsCategory(words_rc)
    if map_phrases:
        print('Phrase mapping enabled.')
    if global_search:
        print('Global search enabled.')
        # the strongs index is only needed (and only built) for global search
        print('Generating strongs index.')
        strongs_index = indexWordByStrongs(words_rc)

    for file_name in usfm_files:
        if not file_name.endswith('.usfm'):
            continue

        file = os.path.join(usfm_dir, file_name)
        print('{}'.format(file_name))
        usfm = read_file(file)
        # occurrence-based mapping always runs first
        usfm = mapUSFMByOccurrence(usfm=usfm,
                                   words_rc=words_rc,
                                   words_index=location_index['occurrences'],
                                   words_category_index=category_index)
        if map_phrases:
            usfm = mapPhrases(usfm)
        if global_search:
            usfm = mapUSFMByGlobalSearch(
                usfm=usfm,
                words_strongs_index=strongs_index,
                words_false_positives_index=location_index['false_positives'],
                words_category_index=category_index)
            # NOTE: if we need to add phrase mapping to global search un-comment these lines
            # if map_phrases:
            #     usfm = mapPhrases(usfm)
        # output file keeps the same base name as the input
        outfile = os.path.join(output_dir, os.path.basename(file))
        write_file(outfile, usfm)
示例#7
0
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing.

    Each usfm file is rewritten in place with word data stripped and chunk
    markers converted before the usx conversion runs.
    :param usfm_dir: directory of usfm files; files are modified in place
    :param usx_dir: directory where the generated usx will be written
    :return: None
    """
    # strip word data
    for name in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, name)
        cleaned = convert_chunk_markers(strip_word_data(read_file(path)))
        write_file(path, cleaned)

    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def build_usx(usfm_dir, usx_dir, logger=None):
    """
    Builds the usx from usfm after performing some custom processing.

    Each usfm file is rewritten in place (word data stripped, chunk markers
    converted, unknown markers removed) before the usx conversion runs.
    :param usfm_dir: directory of usfm files; files are modified in place
    :param usx_dir: directory where the generated usx will be written
    :param logger: optional logger used to report progress
    :return: None
    """
    # strip word data
    for name in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, name)
        usfm = read_file(path)
        usfm = strip_word_data(usfm)
        usfm = convert_chunk_markers(usfm)
        usfm = remove_unknown_markers(usfm)
        write_file(path, usfm)

    if logger:
        logger.debug("Actual USX conversion into {}".format(usx_dir))
    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
示例#9
0
    def sign_file(self, file_to_sign, private_pem_file=None):
        """
        Generates a .sig file and returns the full file name of the .sig file
        :param str|unicode file_to_sign:
        :param str|unicode|None private_pem_file:
        :return: str|unicode The full file name of the .sig file
        :raises Exception: if either openssl invocation writes to stderr
        """
        # if pem file was not passed, use the default one
        if not private_pem_file:
            private_pem_file = self._default_priv_pem()

        # use openssl to sign the content
        # FIX: build the argv list directly instead of formatting a shell
        # string and re-splitting it with shlex; the old approach broke on
        # paths containing spaces or shell metacharacters.
        sha384_file = file_to_sign + '.sha384'
        command = ['openssl', 'dgst', '-sha384', '-sign', private_pem_file,
                   '-out', sha384_file, file_to_sign]
        com = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        out, err = com.communicate()

        if err:
            raise Exception(err)

        # base64 encode the signature
        file_name_without_extension = os.path.splitext(file_to_sign)[0]
        sig_file_name = '{}.sig'.format(file_name_without_extension)
        command = ['openssl', 'base64', '-in', sha384_file,
                   '-out', sig_file_name]
        com = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        out, err = com.communicate()

        if err:
            raise Exception(err)

        # get the base64 encoded signature
        with codecs.open(sig_file_name, 'r', encoding='utf-8') as in_file:
            signed_content = in_file.read()

        # save the signed content, wrapped in the expected payload shape:
        # a list with a single {'si': ..., 'sig': ...} entry
        file_content = []
        signature = {'si': 'uW', 'sig': signed_content}
        file_content.append(signature)
        write_file(sig_file_name, file_content)

        return sig_file_name
def decrypt_file(source_file_name, destination_file_name):
    """
    Decrypts a file using the AWS encryption key 'signing_key'.

    :param string source_file_name: path of the encrypted input file
    :param string destination_file_name: path where the plaintext is written
    :return: bool True if successful, otherwise False
    :raises Exception: if the KMS response contains no plaintext
    """
    kms = boto3.client('kms')

    with codecs.open(source_file_name, 'rb') as in_file:
        ciphertext = in_file.read()

    response = kms.decrypt(CiphertextBlob=ciphertext)

    try:
        plaintext = response['Plaintext']
    except KeyError:
        raise Exception(
            'File not successfully decrypted: {}'.format(source_file_name))

    write_file(destination_file_name, plaintext)

    return True
    def _run(self):
        """
        Builds the catalog from the queued progress records and, when there is
        new content, uploads it as v{api_version}/catalog.json.

        :return: a dict with keys `success`, `incomplete`, `message` and
            `catalog` (`catalog` is None when `success` is False)
        """
        completed_items = 0
        items = self.progress_table.query_items()

        for item in items:
            repo_name = item['repo_name']
            self.logger.info('Processing {}'.format(repo_name))
            try:
                package = json.loads(item['package'])
            except Exception as e:
                self.report_error('Skipping {}. Bad Manifest: {}'.format(repo_name, e))
                continue
            # a few repo names get special handling; everything else is
            # treated as a resource container
            if repo_name == "catalogs":
                self.catalog['catalogs'] = package
            elif repo_name == 'localization':
                self._build_localization(package)
            elif repo_name == 'versification':
                # TODO: we have not yet determined what to do with versification
                pass
            else:
                if self._build_rc(item, package, self.checker):
                    completed_items += 1

        # remove empty languages
        condensed_languages = []
        for lang in self.catalog['languages']:
            if 'resources' in lang and len(lang['resources']) > 0:
                condensed_languages.append(lang)
        self.catalog['languages'] = condensed_languages

        response = {
            'success': False,
            'incomplete': len(self.checker.all_errors) > 0,
            'message': None,
            'catalog': self.catalog
        }

        if completed_items > 0:
            status = self._read_status()
            # skip deployment when the previous run completed and the catalog
            # content has not changed since then
            if status and status['state'] == 'complete' and not self._catalog_has_changed(self.catalog):
                response['success'] = True
                response['message'] = 'No changes detected. Catalog not deployed'
            else:
                # compact separators keep the uploaded catalog small
                cat_str = json.dumps(self.catalog, sort_keys=True, separators=(',',':'))
                try:
                    catalog_path = os.path.join(tempfile.gettempdir(), 'catalog.json')
                    write_file(catalog_path, cat_str)
                    c_stats = os.stat(catalog_path)
                    self.logger.info('New catalog built: {} Kilobytes'.format(c_stats.st_size * 0.001))

                    self.api_handler.upload_file(catalog_path, 'v{0}/catalog.json'.format(self.api_version), cache_time=0)
                    # TRICKY: only mark as complete when there are no errors
                    if len(self.checker.all_errors):
                        self._publish_status('incomplete')
                    else:
                        self._publish_status()

                    response['success'] = True
                    response['message'] = 'Uploaded new catalog to {0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
                except Exception as e:
                    self.checker.log_error('Unable to save catalog: {0}'.format(e)) # pragma: no cover

        if len(self.checker.all_errors) > 0:
            self.report_error(self.checker.all_errors)

        if completed_items == 0:
            self.checker.log_error('There were no formats to process')

        # on failure the (possibly partial) catalog is withheld and the
        # collected checker errors become the message
        if not response['success']:
            response['catalog'] = None
            response['message'] = '{0}'.format(self.checker.all_errors)

        if(response['success']):
            self.logger.info(response['message'])
        else:
            self.logger.error('Catalog was not published due to errors')

        return response
示例#12
0
 def _strip_print_script(file_to_sign):
     """
     Removes the auto-print javascript call from an html file, in place.

     :param file_to_sign: path of the html file to scrub
     :return: None
     """
     content = read_file(file_to_sign)
     scrubbed = content.replace('window.print()', '')
     write_file(file_to_sign, scrubbed)
示例#13
0
if __name__ == '__main__':
    # CLI entry point: convert a CSV file into one USFM file per book
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-l', '--lang', dest='lang', required=True,
                        help='The language represented in the CSV file')
    parser.add_argument('-i', '--input', dest='input', required=True,
                        help='CSV file to convert')
    parser.add_argument('-o', '--output', dest='output', required=True,
                        help='Directory where to save the generated USFM')

    args = parser.parse_args(sys.argv[1:])
    if os.path.isfile(args.output):
        raise Exception('Output must be a directory')

    usfm_books = convert(args.lang, args.input)

    # write each converted book as <sort>-<id>.usfm
    for book in usfm_books:
        file_path = os.path.join(
            args.output, '{}-{}.usfm'.format(book['sort'], book['id']))
        write_file(file_path, book['usfm'])
示例#14
0
    def _build_versification(self):
        """
        DEPRECATED

        we are no longer processing versification.
        :return:
        """
        bible_dir = os.path.join(self.repo_dir, 'bible')
        versification_dirs = os.listdir(bible_dir)
        books = {}
        package = []
        uploads = []

        # group by project
        for vrs_dir in versification_dirs:
            vrs_id = os.path.basename(vrs_dir)
            book_files = sorted(
                glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json')))
            for b in book_files:
                self.logger.debug('Reading "{}" versification for "{}"'.format(
                    vrs_id, b))
                b_id = os.path.splitext(os.path.basename(b))[0]
                try:
                    book_vrs = json.loads(read_file(b))
                except Exception as e:
                    raise Exception, Exception(
                        'Bad JSON: {0}'.format(e)), sys.exc_info()[2]
                book = WebhookHandler.retrieve_or_make(
                    books, b_id, {
                        'identifier':
                        b_id,
                        'chunks_url':
                        '{0}/bible/{}/{}/v{}/chunks.json'.format(
                            self.cdn_url, vrs_id, b_id, self.api_version),
                        'chunks': {}
                    })
                book['chunks'][vrs_id] = book_vrs
        temp_dir = os.path.join(self.temp_dir, 'versification')
        if not os.path.isdir:
            os.mkdir(temp_dir)
        for book in books:
            book = books[book]

            # write chunks
            chunk_file = os.path.join(temp_dir, book['identifier'] + '.json')
            write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True))
            # for now we bypass signing and upload chunks directly
            upload_key = 'bible/{}/v{}/chunks.json'.format(
                book['identifier'], self.api_version)
            uploads.append({'key': upload_key, 'path': chunk_file})

            # build package
            del book['chunks']
            package.append(book)

        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'timestamp': self.timestamp,
            'package': json.dumps(package, sort_keys=True),
            'uploads': uploads,
            'dirty': False
        }
示例#15
0
    def process_format(self, item, dublin_core, project, format):
        """
        Performs the signing on the format object.
        Files outside of the cdn will not be signed
        :param item: the queued record being processed (provides `repo_name`
            and `commit_id` for locating temp files on the cdn)
        :param dublin_core: manifest metadata (language/identifier) used to
            rebuild html urls
        :param project: this may be None.
        :param format: the format dict; mutated in place (url, signature,
            size, modified, format, length)
        :return: (already_signed, newly_signed)
        """
        if 'signature' in format and format['signature']:
            return (True, False)
        else:
            self.logger.debug('Signing {}'.format(format['url']))

        base_name = os.path.basename(format['url'])
        file_to_sign = os.path.join(self.temp_dir, base_name)

        # extract cdn key from url
        url_info = urlparse.urlparse(format['url'])
        src_key = url_info.path.lstrip('/')
        sig_key = '{}.sig'.format(src_key)

        build_rules = get_build_rules(format, 'signing')

        # TRICKY: allow dev environments to download from prod environment
        valid_hosts = [self.cdn_bucket]
        if self.stage_prefix():
            if not self.cdn_bucket.startswith(self.stage_prefix()):
                self.logger.warning(
                    'Expected `cdn_bucket` to begin with the stage prefix ({}) but found {}'
                    .format(self.stage_prefix(), self.cdn_bucket))
            # NOTE(review): str.lstrip strips a *character set*, not a literal
            # prefix — this can over-strip bucket names; verify intent
            prod_cdn_bucket = self.cdn_bucket.lstrip(self.stage_prefix())
            valid_hosts.append(prod_cdn_bucket)
            # TRICKY: force dev environments to handle prod content as external files
            # if format['url'].startswith(prod_cdn_url):
            #     build_rules.append('sign_given_url')

        # TRICKY: some html content is on the api
        if 'html_format' in build_rules:
            valid_hosts.append(self.api_bucket)
            # NOTE(review): same lstrip-as-prefix concern as above
            prod_api_bucket = self.api_bucket.lstrip(self.stage_prefix())
            valid_hosts.append(prod_api_bucket)

        # make sure all formats have a media mime type
        if ('format' not in format
                or not format['format']) and 'url' in format:
            quality = ''
            if 'quality' in format:
                quality = format['quality']
            format['format'] = get_mime_from_url(format['url'], quality)

        # make sure all formats with a media mime type have a size
        if ('size' not in format or not format['size']
            ) and 'url' in format and 'format' in format and format['format']:
            format['size'] = get_remote_file_size(format['url'])

        # verify url is on the cdn
        if not url_info.hostname in valid_hosts:
            # TODO: external media should be imported if it's not too big
            # This allows media to be hosted on third party servers
            format['signature'] = ''  #'{}.sig'.format(format['url'])
            self.logger.warning(
                'cannot sign files outside of the cdn: {}'.format(
                    format['url']))
            return (True, True)

        try:
            headers = self.url_headers(format['url'])
        except Exception as e:
            self.report_error('Could not read headers from {}: {}'.format(
                format['url'], e))
            return (False, False)

        # skip files that are too large
        size = int(headers.get('content-length', 0))
        if size > SigningHandler.max_file_size:
            sig_url = '{}.sig'.format(format['url'])
            if not self._safe_url_exists(sig_url):
                self.logger.warning('File is too large to sign {}'.format(
                    format['url']))

                # TRICKY: spoof the signature
                sig_file = '{}.sig'.format(file_to_sign)
                write_file(sig_file, [{'si': 'uW', 'sig': ''}])
                format['signature'] = sig_url
                self.cdn_handler.upload_file(sig_file, sig_key)

                if not format['modified']:
                    format['modified'] = str_to_timestamp(
                        datetime.datetime.now().isoformat())

                # add file format if missing
                if not 'format' in format or not format['format']:
                    try:
                        _, ext = os.path.splitext(file_to_sign)
                        mime = ext_to_mime(ext)
                        format['format'] = mime
                    except Exception as e:
                        # NOTE(review): e.message is Python 2 only
                        if self.logger:
                            self.logger.error(e.message)

                return (True, True)

            # finish with manually uploaded signature
            format['size'] = size
            if not format['modified']:
                format['modified'] = str_to_timestamp(
                    datetime.datetime.now().isoformat())
            format['signature'] = sig_url
            return (False, True)

        # download file
        try:
            if 'sign_given_url' in build_rules or 'html_format' in build_rules:
                # report error if response is 400+
                if headers.status >= 400:
                    self.logger.warning('Resource not available at {}'.format(
                        format['url']))
                    return (False, False)

                self.download_file(format['url'], file_to_sign)
            else:
                # TRICKY: most files to be signed are stored in a temp directory
                src_temp_key = 'temp/{}/{}/{}'.format(item['repo_name'],
                                                      item['commit_id'],
                                                      src_key)
                self.cdn_handler.download_file(src_temp_key, file_to_sign)
        except Exception as e:
            self.report_error(
                'The file "{}" could not be downloaded: {}'.format(
                    base_name, e))
            return (False, False)

        # strip print script from html
        if 'html_format' in build_rules:
            self.logger.debug('Removing print script from {} html'.format(
                item['repo_name']))
            self._strip_print_script(file_to_sign)

        # sign file
        sig_file = self.signer.sign_file(file_to_sign)
        try:
            self.signer.verify_signature(file_to_sign, sig_file)
        except RuntimeError:
            if self.logger:
                self.logger.warning(
                    'The signature was not successfully verified.')
            return (False, False)

        # TRICKY: re-format html urls
        if 'html_format' in build_rules:
            html_name = dublin_core['identifier']
            if project:
                html_name = project['identifier']
            src_key = '{}/{}/v{}/media/html/{}.html'.format(
                dublin_core['language']['identifier'],
                dublin_core['identifier'], self.api_version, html_name)
            sig_key = '{}.sig'.format(src_key)
            format['url'] = '{}/{}'.format(self.cdn_url, src_key)

        # upload files
        if 'sign_given_url' not in build_rules or 'html_format' in build_rules:
            # TRICKY: upload temp files to production
            self.cdn_handler.upload_file(file_to_sign, src_key)
        self.cdn_handler.upload_file(sig_file, sig_key)

        # add the url of the sig file to the format
        format['signature'] = '{}.sig'.format(format['url'])

        # read modified date from file
        stats = os.stat(file_to_sign)
        if not format['modified']:
            modified = headers.get('last-modified')
            if modified:
                # TRICKY: http header gives an odd date format
                date = datetime.datetime.strptime(modified,
                                                  '%a, %d %b %Y %H:%M:%S %Z')
                modified = str_to_timestamp(date.isoformat())
            else:
                modified = unix_to_timestamp(stats.st_mtime)
            format['modified'] = modified
        format['size'] = stats.st_size

        # retrieve playback time from multimedia files
        _, ext = os.path.splitext(file_to_sign)
        if ext == '.mp3':
            audio = MP3(file_to_sign)
            format['length'] = audio.info.length
        elif ext == '.mp4':
            video = MP4(file_to_sign)
            format['length'] = video.info.length

        # add file format if missing
        if not 'format' in format or not format['format']:
            try:
                mime = ext_to_mime(ext)
                format['format'] = mime
            except Exception as e:
                # NOTE(review): e.message is Python 2 only
                if self.logger:
                    self.logger.error(e.message)

        # clean up disk space
        os.remove(file_to_sign)

        return (False, True)