Пример #1
0
def build_json_source_from_usx(path, lid, pid, date_modified, reporter=None):
    """
    Builds a json source object from a USX file.

    :param path: path of the USX file to read
    :param lid: the language id; forwarded to usx_to_chunked_json
    :param pid: the project id; used to build the chunks.json url and
        forwarded to usx_to_chunked_json
    :param date_modified: the modification date as a string. Dashes are
        removed and everything from the first 'T' onwards is dropped
        (presumably an ISO 8601 timestamp -- confirm with callers).
    :param reporter: a lambda handler instance for reporting (not used by
        this function at present)
    :type reporter: Handler
    :return: a dict of the form
        {'source': {'chapters': ..., 'date_modified': ...}}
    :raises Exception: if the chunk information cannot be downloaded,
        parsed or indexed
    """
    # use utf-8-sig to remove the byte order mark
    with codecs.open(path, 'r', encoding='utf-8-sig') as in_file:
        usx = in_file.readlines()

    chunks_url = 'https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid)
    try:
        chunks = index_chunks(json.loads(get_url(chunks_url)))
    except Exception:
        # A plain string cannot be raised; wrap the message in a real
        # exception so callers see it instead of a TypeError.
        raise Exception(
            'Failed to retrieve chunk information for {}'.format(path))

    book = usx_to_chunked_json(usx, chunks, lid, pid)

    return {
        'source': {
            'chapters': book,
            'date_modified': date_modified.replace('-', '').split('T')[0]
        }
    }
Пример #2
0
def download_chunks(pid):
    """
    Downloads the chunks for the bible book.

    This is a best-effort helper: any ordinary failure while fetching or
    parsing the data results in None rather than an exception.

    :param pid: the project id used to build the chunks.json url
    :return: the chunk json data or None
    """
    chunks_url = 'https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid)
    try:
        return json.loads(get_url(chunks_url))
    except Exception:
        # Deliberately narrower than a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        return None
Пример #3
0
 def get_url(self, url, catch_exception=False):
     """
     Instance-level wrapper that delegates to the free function get_url.

     Inside a method body the bare name get_url does not refer to this
     method (class attributes are not visible as bare names), so the call
     below resolves to the function of the same name in the enclosing scope.

     :param url: passed through unchanged as the first argument
     :param catch_exception: passed through positionally as the second
         argument; its meaning is defined by the wrapped function
     :return: whatever the wrapped get_url returns
     """
     return get_url(url, catch_exception)
Пример #4
0
 def get_url(self, url):
     """
     Instance-level wrapper that delegates to the free function get_url.

     Inside a method body the bare name get_url does not refer to this
     method (class attributes are not visible as bare names), so the call
     below resolves to the function of the same name in the enclosing scope.

     :param url: passed through unchanged as the only argument
     :return: whatever the wrapped get_url returns
     """
     return get_url(url)
Пример #5
0
def _find_chunk_starts(chunk_json, pid, chapter, rc_dir, reporter=None):
    """
    Looks up the chunk data recorded for a single chapter.

    :param chunk_json: the indexed chunk data, looked up by chapter
    :param pid: the project id
    :param chapter: the chapter directory name
    :param rc_dir: the resource container directory (only used in the
        error report)
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: the entry of chunk_json for the chapter, or an empty tuple
        when no entry exists
    """
    if chapter in chunk_json:
        return chunk_json[chapter]

    # attempt to recover if Psalms: retry with the key padded to 3 digits
    if pid == 'psa':
        padded = chapter.zfill(3)
        if padded in chunk_json:
            return chunk_json[padded]

    if reporter:
        reporter.report_error(
            'Could not find chunk data for {} {} {}'.format(rc_dir, pid, chapter))
    # With no chunk data the whole chapter is treated as a single chunk.
    return ()


def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):
    """
    Converts a v3 tN into its v2 equivalent.
    This will write a bunch of files and return the files to be uploaded.

    Verse files of every chapter are grouped into chunks. For 'obs' every
    verse file starts a new chunk; for other projects a new chunk starts
    whenever the verse appears in the downloaded chunk data for the
    chapter. If the chunk data has no entry for a chapter, an error is
    reported and the chapter is emitted as one chunk.

    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a dict mapping '<lid>_*_<pid>_tn' keys to the value returned
        by prep_data_upload for that project's notes.json
    :raises Exception: if a project directory listed in the manifest does
        not exist
    """
    note_general_re = re.compile('^([^#]+)', re.UNICODE)
    note_re = re.compile('^#+([^#\n]+)#*([^#]*)', re.UNICODE | re.MULTILINE | re.DOTALL)
    tn_uploads = {}

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by newer PyYAML releases. Consider
    # yaml.safe_load if the manifest only contains plain data -- confirm.
    manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    dc = manifest['dublin_core']

    for project in manifest['projects']:
        pid = Handler.sanitize_identifier(project['identifier'])
        chunk_json = {}
        if pid != 'obs':
            try:
                data = get_url('https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid))
                chunk_json = index_chunks(json.loads(data))
            except Exception:
                # skip this project, but let KeyboardInterrupt/SystemExit through
                if reporter:
                    reporter.report_error('Failed to retrieve chunk information for {}-{}'.format(lid, pid))
                continue

        note_dir = os.path.normpath(os.path.join(rc_dir, project['path']))
        if not os.path.exists(note_dir):
            raise Exception('Project directory missing. Could not find {}'.format(note_dir))
        note_json = []

        for chapter in os.listdir(note_dir):
            if chapter in ['.', '..', 'front']:
                continue
            chapter_dir = os.path.join(note_dir, chapter)
            verses = sorted(os.listdir(chapter_dir))

            notes = []
            firstvs = None
            # note headings already emitted for this chapter (normalized)
            seen_refs = set()
            # resolved lazily so an error is only reported when the chunk
            # data is actually needed, and at most once per chapter
            chunk_starts = None

            for verse in verses:
                if verse in ['.', '..', 'intro.md']:
                    continue

                verse_file = os.path.join(chapter_dir, verse)
                verse = verse.split('.')[0]
                verse_body = convert_rc_links(read_file(verse_file))
                general_notes = note_general_re.search(verse_body)

                if firstvs is None:
                    # the first verse file always opens the first chunk
                    firstvs = verse
                else:
                    if pid == 'obs':
                        starts_chunk = True
                    else:
                        if chunk_starts is None:
                            chunk_starts = _find_chunk_starts(
                                chunk_json, pid, chapter, rc_dir, reporter)
                        starts_chunk = verse in chunk_starts

                    if starts_chunk:
                        # close the previous chunk and open a new one
                        note_json.append({
                            'id': '{}-{}'.format(chapter, firstvs),
                            'tn': notes
                        })
                        firstvs = verse
                        notes = []

                if general_notes:
                    verse_body = note_general_re.sub('', verse_body)
                    notes.append({
                        'ref': 'General Information',
                        'text': general_notes.group(0).strip()
                    })

                for note in note_re.findall(verse_body):
                    ref = note[0].strip()
                    ref_key = ref.lower()
                    # TRICKY: do not include translation words in the list of notes
                    if ref_key == 'translationwords':
                        continue
                    # a heading is only emitted once per chapter
                    if ref_key in seen_refs:
                        continue
                    seen_refs.add(ref_key)
                    notes.append({
                        'ref': ref,
                        'text': note[1].strip()
                    })

            # close last chunk
            if firstvs is not None:
                note_json.append({
                    'id': '{}-{}'.format(chapter, firstvs),
                    'tn': notes
                })

        if note_json:
            tn_key = '_'.join([lid, '*', pid, 'tn'])
            note_json.append({'date_modified': dc['modified'].replace('-', '')})
            note_upload = prep_data_upload('{}/{}/notes.json'.format(pid, lid), note_json, temp_dir)
            tn_uploads[tn_key] = note_upload

    return tn_uploads