def test_processing_hbo(self):
        """
        Test downloading and processing some hebrew
        :return:
        """
        # NOTE(review): this early return disables the entire test —
        # presumably to skip the slow network download; confirm intent.
        return
        # Download the Hebrew (hbo) UHB resource container into temp_dir.
        rc_dir = download_rc('hbo', 'uhb',
                             'https://cdn.door43.org/hbo/uhb/v2.1.1/uhb.zip',
                             self.temp_dir)

        # Read the RC manifest to discover the projects (books) it contains.
        manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
        usx_dir = os.path.join(rc_dir, 'usx')
        for project in manifest['projects']:
            pid = project['identifier']

            # copy usfm project file
            usfm_dir = os.path.join(self.temp_dir, 'usfm')
            if not os.path.exists(usfm_dir):
                os.makedirs(usfm_dir)
            usfm_dest_file = os.path.normpath(
                os.path.join(usfm_dir, project['path']))
            usfm_src_file = os.path.normpath(
                os.path.join(rc_dir, project['path']))
            shutil.copyfile(usfm_src_file, usfm_dest_file)

            # transform usfm to usx
            build_usx(usfm_dir, usx_dir)

            # clean up converted usfm file
            remove(usfm_dest_file, True)

            # convert USX to JSON
            # assumes build_usx names its output '<PID-uppercased>.usx'
            path = os.path.normpath(
                os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
            source = build_json_source_from_usx(path, 'hbo', pid, '2019')
Пример #2
0
    def _index_note_files(self, lid, rid, format, process_id, temp_dir):
        """
        Indexes note files from a help-type resource container.

        :param lid: the language id
        :param rid: the resource id; only 'tn' and 'obs-tn' are processed
        :param format: the format dict providing the 'format' string and 'url'
        :param process_id: identifier used in log messages
        :param temp_dir: directory where temporary files may be created
        :return: a dictionary of notes to upload
        """
        # anything other than a tN/obs-tN help resource produces nothing
        if rid not in ('obs-tn', 'tn') or 'type=help' not in format['format']:
            return {}

        self.logger.debug('Processing {}'.format(process_id))
        rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                             self.download_file)
        if not rc_dir:
            return {}

        uploads = index_tn_rc(lid=lid, temp_dir=temp_dir, rc_dir=rc_dir)
        remove_tree(rc_dir, True)
        return uploads
Пример #3
0
    def _process_usfm(self, lid, rid, resource, format, temp_dir):
        """
        Converts a USFM bundle into usx, loads the data into json and uploads it.

        Progress is recorded in self.status (keyed by '<lid>_<rid>_<pid>') so
        already-processed projects are skipped on subsequent runs.

        :param lid: the language id
        :param rid: the resource id
        :param resource: the resource dict; provides the 'version' used in keys
        :param format: the format dict; provides the 'format' string,
            'url' and 'modified' date
        :param temp_dir: directory where temporary files may be created
        :return: None; results are uploaded and tracked in self.status
        """

        format_str = format['format']
        if 'application/zip' in format_str and 'usfm' in format_str:
            self.logger.debug('Downloading {}'.format(format['url']))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return

            # safe_load: manifests are plain data and yaml.load without an
            # explicit Loader is unsafe and deprecated.
            manifest = yaml.safe_load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            usx_dir = os.path.join(rc_dir, 'usx')
            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                # pid is project identifier, lid is language id, rid is resourceid
                process_id = '_'.join([lid, rid, pid])

                if process_id not in self.status['processed']:
                    self.logger.debug(
                        'Processing usfm for {}'.format(process_id))

                    # copy usfm project file
                    usfm_dir = os.path.join(temp_dir,
                                            '{}_usfm'.format(process_id))
                    if not os.path.exists(usfm_dir):
                        os.makedirs(usfm_dir)
                    usfm_dest_file = os.path.normpath(
                        os.path.join(usfm_dir, project['path']))
                    usfm_src_file = os.path.normpath(
                        os.path.join(rc_dir, project['path']))

                    # oversized bundles are skipped rather than converted
                    if os.path.getsize(usfm_src_file) < self.max_usfm_size:

                        shutil.copyfile(usfm_src_file, usfm_dest_file)

                        # transform usfm to usx
                        build_usx(usfm_dir, usx_dir, self.logger)

                        # convert USX to JSON
                        # assumes build_usx names its output '<PID>.usx'
                        path = os.path.normpath(
                            os.path.join(usx_dir,
                                         '{}.usx'.format(pid.upper())))
                        source = build_json_source_from_usx(
                            path, format['modified'], self)
                        upload = prep_data_upload(
                            '{}/{}/{}/v{}/source.json'.format(
                                pid, lid, rid, resource['version']),
                            source['source'], temp_dir)
                        self.logger.debug('Uploading {}/{}/{}'.format(
                            self.cdn_bucket, TsV2CatalogHandler.cdn_root_path,
                            upload['key']))
                        self.cdn_handler.upload_file(
                            upload['path'],
                            '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                           upload['key']))

                        self.status['processed'][process_id] = []
                    else:
                        # logger.warning: .warn is a deprecated alias
                        self.logger.warning(
                            "Skipping {} because it is too big".format(
                                process_id))
                        self.status['processed'][process_id] = ['skipped']

                    # persist progress so a restart resumes where we left off
                    self.status['timestamp'] = time.strftime(
                        "%Y-%m-%dT%H:%M:%SZ")
                    self.db_handler.update_item(
                        {'api_version': TsV2CatalogHandler.api_version},
                        self.status)
                else:
                    self.logger.debug(
                        'USFM for {} has already been processed'.format(
                            process_id))

            # clean up download
            remove_tree(rc_dir, True)
Пример #4
0
    def _index_words_files(self, lid, rid, format, process_id, temp_dir):
        """
        Returns an array of markdown files found in a tW dictionary

        :param lid: the language id
        :param rid: the resource id; only 'tw' is processed
        :param format: the format dict providing the 'format' string and 'url'
        :param process_id: identifier used in log messages
        :param temp_dir: directory where temporary files may be created
        :return: a dict mapping '<lid>_*_*_tw' to the prepared upload, or {}
        """
        # raw strings so regex escapes are explicit rather than relying on
        # Python preserving invalid string escapes
        word_title_re = re.compile(r'^#([^#\n]*)#*', re.UNICODE)
        h2_re = re.compile(r'^##([^#\n]*)#*', re.UNICODE)
        obs_example_re = re.compile(r'\_*\[([^\[\]]+)\]\(([^\(\)]+)\)_*(.*)',
                                    re.UNICODE | re.IGNORECASE)
        block_re = re.compile(r'^##', re.MULTILINE | re.UNICODE)
        word_links_re = re.compile(
            r'\[([^\[\]]+)\]\(\.\.\/(kt|other)\/([^\(\)]+)\.md\)',
            re.UNICODE | re.IGNORECASE)
        ta_html_re = re.compile(
            r'(<a\s+href="(:[a-z-_0-9]+:ta:vol\d:[a-z-\_]+:[a-z-\_]+)"\s*>([^<]+)<\/a>)',
            re.UNICODE | re.IGNORECASE)

        words = []
        format_str = format['format']
        if rid == 'tw' and 'type=dict' in format_str:
            self.logger.debug('Processing {}'.format(process_id))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return {}

            # safe_load: manifests are plain data and yaml.load without an
            # explicit Loader is unsafe and deprecated.
            manifest = yaml.safe_load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            dc = manifest['dublin_core']

            # TRICKY: there should only be one project
            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                content_dir = os.path.normpath(
                    os.path.join(rc_dir, project['path']))
                categories = os.listdir(content_dir)
                for cat in categories:
                    if cat in ['.', '..']: continue
                    cat_dir = os.path.join(content_dir, cat)
                    if not os.path.isdir(cat_dir): continue
                    word_files = os.listdir(cat_dir)
                    for word in word_files:
                        if word in ['.', '..', '.DS_Store']: continue
                        word_path = os.path.join(cat_dir, word)
                        word_id = word.split('.md')[0]
                        try:
                            word_content = read_file(word_path)
                        except Exception as e:
                            # str(e): e.message does not exist on Python 3
                            self.report_error(
                                'Failed to read file {}: {}'.format(
                                    word_path, str(e)))
                            raise

                        # TRICKY: the title is always at the top
                        title_match = word_title_re.match(word_content)
                        if title_match:
                            title = title_match.group(1)
                        else:
                            self.report_error(
                                'missing title in {}'.format(word_path))
                            continue
                        word_content = word_title_re.sub('',
                                                         word_content).strip()

                        # TRICKY: the definition title is always after the title
                        def_title = ''
                        def_title_match = h2_re.match(word_content)
                        if def_title_match:
                            def_title = def_title_match.group(1).strip()
                            word_content = h2_re.sub('', word_content).strip()
                        else:
                            self.report_error(
                                'missing definition title in {}'.format(
                                    word_path))

                        # find obs examples
                        blocks = block_re.split(word_content)
                        cleaned_blocks = []
                        examples = []
                        for block in blocks:
                            if 'examples from the bible stories' in block.lower(
                            ):
                                for link in obs_example_re.findall(block):
                                    if 'obs' not in link[1]:
                                        self.logger.error(
                                            'non-obs link found in passage examples: {}'
                                            .format(link[1]))
                                    else:
                                        examples.append({
                                            'ref':
                                            link[0].replace(':', '-'),
                                            'text':
                                            markdown.markdown(link[2].strip())
                                        })
                            else:
                                cleaned_blocks.append(block)
                        word_content = '##'.join(cleaned_blocks)

                        # find all tW links and use them in related words
                        related_words = [
                            w[2] for w in word_links_re.findall(word_content)
                        ]

                        # convert links to legacy form. TODO: we should convert links after converting to html so we don't have to do it twice.
                        word_content = convert_rc_links(word_content)
                        word_content = markdown.markdown(word_content)
                        # convert html links back to dokuwiki links
                        # TRICKY: we converted the ta urls, but now we need to format them as dokuwiki links
                        # e.g. [[en:ta:vol1:translate:translate_unknown | How to Translate Unknowns]]
                        for ta_link in ta_html_re.findall(word_content):
                            new_link = u'[[{} | {}]]'.format(
                                ta_link[1], ta_link[2])
                            word_content = word_content.replace(
                                ta_link[0], new_link)

                        words.append({
                            'aliases': [
                                a.strip() for a in title.split(',')
                                if a.strip() != word_id
                                and a.strip() != title.strip()
                            ],
                            'cf':
                            related_words,
                            'def':
                            word_content,
                            'def_title':
                            def_title.rstrip(':'),
                            'ex':
                            examples,
                            'id':
                            word_id,
                            'sub':
                            '',
                            'term':
                            title.strip()
                        })

            remove_tree(rc_dir, True)

            if words:
                words.append({
                    'date_modified':
                    dc['modified'].replace('-', '').split('T')[0]
                })
                upload = prep_data_upload('bible/{}/words.json'.format(lid),
                                          words, temp_dir)
                return {'_'.join([lid, '*', '*', 'tw']): upload}
        return {}
Пример #5
0
    def _index_question_files(self, lid, rid, format, process_id, temp_dir):
        """
        Indexes question files from a tQ/obs-tQ help resource container.

        :param lid: the language id
        :param rid: the resource id; only 'tq' and 'obs-tq' are processed
        :param format: the format dict providing the 'format' string and 'url'
        :param process_id: identifier used in log messages
        :param temp_dir: directory where temporary files may be created
        :return: a dict mapping '<lid>_*_<pid>_tq' keys to prepared uploads
        """
        question_re = re.compile('^#+([^#\n]+)#*([^#]*)',
                                 re.UNICODE | re.MULTILINE | re.DOTALL)
        tq_uploads = {}

        format_str = format['format']
        if (rid == 'obs-tq' or rid == 'tq') and 'type=help' in format_str:
            self.logger.debug('Processing {}'.format(process_id))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return {}

            # safe_load: manifests are plain data and yaml.load without an
            # explicit Loader is unsafe and deprecated.
            manifest = yaml.safe_load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            dc = manifest['dublin_core']

            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                question_dir = os.path.normpath(
                    os.path.join(rc_dir, project['path']))
                question_json = []

                if not os.path.isdir(question_dir):
                    self.logger.warning(
                        'Missing directory at {}. Is the manifest out of date?'
                        .format(question_dir))
                    continue

                chapters = os.listdir(question_dir)
                for chapter in chapters:
                    if chapter in ['.', '..']: continue
                    unique_questions = {}
                    chapter_dir = os.path.join(question_dir, chapter)
                    chunks = os.listdir(chapter_dir)
                    for chunk in chunks:
                        if chunk in ['.', '..']: continue
                        chunk_file = os.path.join(chapter_dir, chunk)
                        chunk = chunk.split('.')[0]
                        chunk_body = read_file(chunk_file)

                        for question in question_re.findall(chunk_body):
                            # de-duplicate questions by an md5 of the answer
                            hasher = hashlib.md5()
                            hasher.update(question[1].strip().encode('utf-8'))
                            question_hash = hasher.hexdigest()
                            if question_hash not in unique_questions:
                                # insert unique question
                                unique_questions[question_hash] = {
                                    'q': question[0].strip(),
                                    'a': question[1].strip(),
                                    'ref': [u'{}-{}'.format(chapter, chunk)]
                                }
                            else:
                                # append new reference
                                unique_questions[question_hash]['ref'].append(
                                    '{}-{}'.format(chapter, chunk))

                    # values() avoids looping over a variable shadowing
                    # the builtin `hash`
                    question_array = list(unique_questions.values())
                    if question_array:
                        question_json.append({
                            'id': chapter,
                            'cq': question_array
                        })

                if question_json:
                    tq_key = '_'.join([lid, '*', pid, 'tq'])
                    question_json.append(
                        {'date_modified': dc['modified'].replace('-', '')})
                    upload = prep_data_upload(
                        '{}/{}/questions.json'.format(pid, lid), question_json,
                        temp_dir)
                    tq_uploads[tq_key] = upload
            remove_tree(rc_dir, True)
        return tq_uploads