Exemplo n.º 1
0
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install the corpora described by the configuration files in conf_list.

    Each file is loaded into a fresh InstallJson instance. When `replace`
    is truthy, the corpus record is removed from the backend first. A corpus
    the backend does not contain yet is saved (along with its size obtained
    via get_corpus_size()); a corpus already present is skipped. Finally,
    if a registry file named after the corpus exists in reg_dir (or in its
    `variant` sub-directory when a variant is given), it is handed over
    to parse_registry().

    arguments:
    conf_list -- an iterable of paths to corpus configuration files
    backend -- an object providing remove_corpus(), contains_corpus()
               and save_corpus_config()
    reg_dir -- a directory containing registry files
    variant -- an optional name of a registry sub-directory
    replace -- if truthy then existing records are removed first
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing {0}'.format(conf_file))
        with open(conf_file) as conf_src:
            conf = InstallJson()
            conf.update(conf_src)

            if replace:
                log.info('Removing existing record (including registry) for {0}.'.format(conf.ident))
                backend.remove_corpus(conf.ident)

            if not backend.contains_corpus(conf.ident):
                corpus_size = get_corpus_size(conf.ident, reg_dir)
                backend.save_corpus_config(conf, reg_dir, corpus_size)
                log.info('Saved config for {0}.'.format(conf.ident))
            else:
                log.info('Corpus {0} already present - skipping.'.format(conf.ident))

            # registry files of a variant live in a sub-directory of reg_dir
            path_items = [reg_dir, variant, conf.ident] if variant else [reg_dir, conf.ident]
            reg_path = os.path.join(*path_items)
            if not os.path.isfile(reg_path):
                continue

            enc = infer_encoding(reg_path)
            with open(reg_path) as reg_src:
                parse_registry(reg_src, variant=variant, backend=backend, encoding=enc)
Exemplo n.º 2
0
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Process corpus configuration files one by one.

    For every path in conf_list the function:
    1) loads the file into an InstallJson instance,
    2) removes the corpus from the backend if `replace` is truthy,
    3) saves the configuration unless the backend already contains
       the corpus,
    4) parses the corpus registry file via parse_registry() in case such
       a file exists (it is searched for in reg_dir/variant if a variant
       is given, otherwise directly in reg_dir).
    """
    logger = logging.getLogger(__name__)

    for conf_file in conf_list:
        logger.info('Processing {0}'.format(conf_file))
        with open(conf_file) as conf_fh:
            conf = InstallJson()
            conf.update(conf_fh)

            if replace:
                msg = 'Removing existing record (including registry) for {0}.'
                logger.info(msg.format(conf.ident))
                backend.remove_corpus(conf.ident)

            already_present = backend.contains_corpus(conf.ident)
            if already_present:
                msg = 'Corpus {0} already present - skipping.'
                logger.info(msg.format(conf.ident))
            else:
                size = get_corpus_size(conf.ident, reg_dir)
                backend.save_corpus_config(conf, reg_dir, size)
                msg = 'Saved config for {0}.'
                logger.info(msg.format(conf.ident))

            reg_path = (os.path.join(reg_dir, variant, conf.ident) if variant
                        else os.path.join(reg_dir, conf.ident))

            if os.path.isfile(reg_path):
                encoding = infer_encoding(reg_path)
                with open(reg_path) as reg_fh:
                    parse_registry(reg_fh, variant=variant, backend=backend, encoding=encoding)
Exemplo n.º 3
0
def create_corp_record(node, db, shared, json_out, variant):
    """
    Insert a corpus described by `node` into the `kontext_corpus` table
    and copy the same values to the JSON output.

    The row is inserted first, then the referenced structures and
    structural attributes are created and only after that they are
    written to the row by a separate UPDATE.

    arguments:
    node -- an element providing corpus properties via its `attrib`
            mapping; the 'ident' attribute is required
    db -- a database handle passed to new_cursor() and the create_* helpers
    shared -- an object providing get_corpus_size(ident)
    json_out -- a JSON output object providing switch_to() and `current`
    variant -- not used by this function

    raises:
    KeyError -- if `node` has no 'ident' attribute
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    default_virt_keyboard = attrs.get('default_virt_keyboard')
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')
    # the same timestamp is used for both 'created' and 'updated'
    curr_time = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = new_cursor(db)
    cursor.execute('INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
                   'tagset, collator_locale, speech_overlap_val, use_safe_font, size, default_virt_keyboard) '
                   'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                   (ident, group_name, version, curr_time, curr_time, 1, web, tagset,
                    collator_locale, speech_overlap_val,
                    use_safe_font, shared.get_corpus_size(ident), default_virt_keyboard))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)

    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)

    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)

    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute('UPDATE kontext_corpus SET '
                   'sentence_struct = ?, '
                   'speech_segment_struct = ?, speech_segment_attr = ?, speaker_id_struct = ?, '
                   'speaker_id_attr = ?, speech_overlap_struct = ?, speech_overlap_attr = ? '
                   'WHERE id = ?', (sentence_struct, speech_segment_struct, speech_segment_attr,
                                    speaker_id_struct, speaker_id_attr, speech_overlap_struct, speech_overlap_attr,
                                    ident))
    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # Write "struct.attr" only if both parts are available; formatting
    # missing values would store the literal string 'None.None'.
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # NOTE(review): the flag is set on json_out itself while all the other
    # values go to json_out.current - confirm this is intended.
    json_out.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    # NOTE(review): likewise, this targets json_out.metadata rather than
    # json_out.current.metadata - verify against the json_out class.
    json_out.metadata.default_virt_keyboard = default_virt_keyboard
    parse_tckc(node, db, ident, json_out.current)
Exemplo n.º 4
0
def create_corp_record(node, db, shared, json_out, variant):
    """
    Insert a corpus described by `node` into the `kontext_corpus` table
    and copy the same values to the JSON output.

    The row is inserted first, then the referenced structures and
    structural attributes are created and only after that they are
    written to the row by a separate UPDATE.

    arguments:
    node -- an element providing corpus properties via its `attrib`
            mapping; the 'ident' attribute is required
    db -- a database handle passed to new_cursor() and the create_* helpers
    shared -- an object providing get_corpus_size(ident)
    json_out -- a JSON output object providing switch_to() and `current`
    variant -- not used by this function

    raises:
    KeyError -- if `node` has no 'ident' attribute
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')
    # the same timestamp is used for both 'created' and 'updated'
    curr_time = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = new_cursor(db)
    cursor.execute('INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
                   'tagset, collator_locale, speech_overlap_val, use_safe_font, size) '
                   'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                   (ident, group_name, version, curr_time, curr_time, 1, web, tagset,
                    collator_locale, speech_overlap_val,
                    use_safe_font, shared.get_corpus_size(ident)))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)

    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)

    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)

    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute('UPDATE kontext_corpus SET '
                   'sentence_struct = ?, '
                   'speech_segment_struct = ?, speech_segment_attr = ?, speaker_id_struct = ?, '
                   'speaker_id_attr = ?, speech_overlap_struct = ?, speech_overlap_attr = ? '
                   'WHERE id = ?', (sentence_struct, speech_segment_struct, speech_segment_attr,
                                    speaker_id_struct, speaker_id_attr, speech_overlap_struct, speech_overlap_attr,
                                    ident))
    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # Write "struct.attr" only if both parts are available; formatting
    # missing values would store the literal string 'None.None'.
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # NOTE(review): the flag is set on json_out itself while all the other
    # values go to json_out.current - confirm this is intended.
    json_out.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)
Exemplo n.º 5
0
def create_corp_record(node, db, shared, json_out):
    """
    Insert a corpus described by `node` into the `kontext_corpus` table,
    copy the same values to the JSON output and create the initial
    registry record for the corpus.

    If create_initial_registry() returns a truthy sentence structure ID,
    it is stored to the inserted row by a follow-up UPDATE.

    arguments:
    node -- an element providing corpus properties via its `attrib`
            mapping; the 'ident' attribute is required
    db -- a database connection (must provide cursor())
    shared -- an object providing get_corpus_size(ident)
    json_out -- a JSON output object providing switch_to() and `current`
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment = attrs.get('speech_segment')
    speaker_id_attr = attrs.get('speaker_id_attr')
    speech_overlap_attr = attrs.get('speech_overlap_attr')
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')
    # a single timestamp serves as both 'created' and 'updated'
    timestamp = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = db.cursor()
    insert_sql = (
        'INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
        'tagset, collator_locale, speech_segment, speaker_id_attr,  speech_overlap_attr, '
        'speech_overlap_val, use_safe_font, size) '
        'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
    insert_args = (
        ident, group_name, version, timestamp, timestamp, 1, web, tagset,
        collator_locale, speech_segment, speaker_id_attr, speech_overlap_attr,
        speech_overlap_val, use_safe_font, shared.get_corpus_size(ident))
    cursor.execute(insert_sql, insert_args)

    # json generator
    json_out.switch_to(ident)
    record = json_out.current
    record.ident = ident
    record.web = web
    record.sentence_struct = sentence_struct
    record.tagset = tagset
    record.speech_segment = speech_segment
    record.speaker_id_attr = speaker_id_attr
    record.speech_overlap_attr = speech_overlap_attr
    record.speech_overlap_val = speech_overlap_val
    record.collator_locale = collator_locale
    json_out.use_safe_font = use_safe_font
    create_metadata_record(node, ident, db, shared, json_out.current)
    parse_tckc(node, db, ident, json_out.current)

    sentence_struct_id = create_initial_registry(db, shared, ident, sentence_struct)
    if not sentence_struct_id:
        return
    cursor.execute(
        'UPDATE kontext_corpus SET sentence_struct_id = ? WHERE id = ?',
        (sentence_struct_id, ident))
Exemplo n.º 6
0
def create_corp_record(node, db, shared, json_out, variant, create_if_none):
    """
    Update the `corpora` row of a corpus described by `node` and copy
    the same values to the JSON output.

    If no such corpus exists then either a bare row is inserted first
    (create_if_none is truthy) or the function returns without doing
    anything else. Referenced structures and structural attributes are
    created before they are written to the row by the second UPDATE.

    arguments:
    node -- an element providing corpus properties via its `attrib`
            mapping; the 'ident' attribute is required
    db -- a database handle passed to new_cursor() and the helper functions
    shared -- an object providing get_corpus_size(ident)
    json_out -- a JSON output object providing switch_to() and `current`
    variant -- not used by this function
    create_if_none -- if truthy then a missing corpus row is created

    raises:
    KeyError -- if `node` has no 'ident' attribute
    """
    cursor = new_cursor(db)
    attrs = node.attrib
    ident = attrs['ident'].lower()
    if not _corpus_exists(db, ident):
        if not create_if_none:
            return
        cursor.execute('INSERT INTO corpora (name) VALUES (%s)', (ident,))
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')
    group_name, version = InstallJson.create_sorting_values(ident)

    t1 = datetime.datetime.now(tz=pytz.timezone('Europe/Prague')).strftime("%Y-%m-%dT%H:%M:%S%z")
    # 'updated' is assigned exactly once (from t1); assigning the same column
    # twice in a single UPDATE is rejected by some database engines.
    cursor.execute('UPDATE corpora SET group_name = %s, version = %s, '
                   'web = %s, tagset = %s, collator_locale = %s, speech_overlap_val = %s, use_safe_font = %s, '
                   'size = %s, created = %s, updated = %s '
                   'WHERE name = %s',
                   (group_name, version, web, tagset, collator_locale, speech_overlap_val, use_safe_font,
                    shared.get_corpus_size(ident), t1, t1, ident))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)

    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)

    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)

    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute('UPDATE corpora SET '
                   'sentence_struct = %s, '
                   'speech_segment_struct = %s, speech_segment_attr = %s, speaker_id_struct = %s, '
                   'speaker_id_attr = %s, speech_overlap_struct = %s, speech_overlap_attr = %s '
                   'WHERE name = %s', (sentence_struct, speech_segment_struct, speech_segment_attr,
                                       speaker_id_struct, speaker_id_attr, speech_overlap_struct, speech_overlap_attr,
                                       ident))
    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # Write "struct.attr" only if both parts are available; formatting
    # missing values would store the literal string 'None.None'.
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # NOTE(review): the flag is set on json_out itself while all the other
    # values go to json_out.current - confirm this is intended.
    json_out.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)
Exemplo n.º 7
0
 def switch_to(self, corpus_id):
     """
     Make the record of the corpus `corpus_id` the current one.

     A new InstallJson instance is registered under `corpus_id` in case
     there is no record for the corpus yet.
     """
     records = self._data
     if corpus_id not in records:
         records[corpus_id] = InstallJson()
     self._current = records[corpus_id]
Exemplo n.º 8
0
def create_corp_record(node, db, shared, json_out, variant):
    """
    Update the `corpora` row of a corpus described by `node` and copy
    the same values to the JSON output.

    Nothing is done if _corpus_exists() reports the corpus as missing.
    Referenced structures and structural attributes are created before
    they are written to the row by the second UPDATE.

    arguments:
    node -- an element providing corpus properties via its `attrib`
            mapping; the 'ident' attribute is required
    db -- a database handle passed to new_cursor() and the helper functions
    shared -- an object providing get_corpus_size(ident)
    json_out -- a JSON output object providing switch_to() and `current`
    variant -- not used by this function

    raises:
    KeyError -- if `node` has no 'ident' attribute
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    if not _corpus_exists(db, ident):
        return
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(
        attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(
        attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = new_cursor(db)
    t1 = datetime.datetime.now(
        tz=pytz.timezone('Europe/Prague')).strftime("%Y-%m-%dT%H:%M:%S%z")
    # 'updated' is assigned exactly once (from t1); assigning the same column
    # twice in a single UPDATE is rejected by some database engines.
    cursor.execute(
        'UPDATE corpora SET group_name = %s, version = %s, '
        'web = %s, tagset = %s, collator_locale = %s, speech_overlap_val = %s, use_safe_font = %s, '
        'size = %s, created = %s, updated = %s '
        'WHERE name = %s',
        (group_name, version, web, tagset, collator_locale, speech_overlap_val,
         use_safe_font, shared.get_corpus_size(ident), t1, t1, ident))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct,
                          speech_segment_attr)

    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)

    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct,
                          speech_overlap_attr)

    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute(
        'UPDATE corpora SET '
        'sentence_struct = %s, '
        'speech_segment_struct = %s, speech_segment_attr = %s, speaker_id_struct = %s, '
        'speaker_id_attr = %s, speech_overlap_struct = %s, speech_overlap_attr = %s '
        'WHERE name = %s',
        (sentence_struct, speech_segment_struct, speech_segment_attr,
         speaker_id_struct, speaker_id_attr, speech_overlap_struct,
         speech_overlap_attr, ident))
    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # Write "struct.attr" only if both parts are available; formatting
    # missing values would store the literal string 'None.None'.
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # NOTE(review): the flag is set on json_out itself while all the other
    # values go to json_out.current - confirm this is intended.
    json_out.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)