Example 1
def make_reg_lm(db_names, n):

    db_names_id = '_'.join(sorted(db_names))

    texts_filename = f'reg_lm_texts_{db_names_id}.txt'
    texts_filepath = os.path.join(PRETRAINED_DIR, texts_filename)

    if not os.path.isfile(texts_filepath):
        dataset = load_datasets(db_names)
        texts = []

        for e in dataset:
            # keep only the lexicalizations annotated as 'good'
            good_lexes = [l for l in e.lexes if l['comment'] == 'good']

            for l in good_lexes:
                # extract a REG LM training text from the lexicalization and its template
                t = extract_text_reg_lm(l['text'], l['template'])
                if t:
                    texts.append(t)

        with open(texts_filepath, 'w', encoding='utf-8') as f:
            for t in texts:
                f.write(f'{t}\n')

    # run the KenLM binary (lmplz): corpus text on stdin, ARPA model on stdout
    with open(texts_filepath, 'rb') as f:
        reg_lm_process = subprocess.run([KENLM, '-o', str(n)],
                                        stdout=subprocess.PIPE,
                                        input=f.read())

    lm_filename = f'reg_lm_model_{n}_{db_names_id}.arpa'
    lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)

    with open(lm_filepath, 'wb') as f:
        f.write(reg_lm_process.stdout)
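These snippets share a few module-level names that they do not show: os, subprocess, pickle, defaultdict, the KENLM command, and the PRETRAINED_DIR output directory. A minimal sketch of that assumed setup, plus how the generated ARPA file could be loaded and scored with the kenlm Python bindings (the binary path, directory, and file name below are hypothetical):

# Assumed setup shared by the examples; the paths are hypothetical placeholders.
import os
import pickle
import subprocess
from collections import defaultdict

KENLM = '/path/to/kenlm/build/bin/lmplz'  # lmplz reads a text corpus on stdin and writes an ARPA model to stdout
PRETRAINED_DIR = 'pretrained'             # where training texts and .arpa files are cached

# Scoring with a resulting model via the kenlm Python bindings.
import kenlm

reg_lm = kenlm.Model(os.path.join(PRETRAINED_DIR, 'reg_lm_model_3_webnlg.arpa'))
print(reg_lm.score('some referring expression text', bos=True, eos=True))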
Example 2
def make_sa_lm(db_names, n=2):

    dataset = load_datasets(db_names)
    db_names_id = '_'.join(sorted(db_names))

    sas = []

    # only entries with at least two triples yield non-trivial partitionings
    for e in (e for e in dataset if len(e.triples) >= 2):
        partitionings = extract_partitionings(e)
        sas.extend(partitionings)

    sa_lm_texts_filename = f'txs_sa_texts_{db_names_id}.txt'
    sa_lm_texts_filepath = os.path.join(PRETRAINED_DIR, sa_lm_texts_filename)

    with open(sa_lm_texts_filepath, 'w') as f:
        for sa in sas:
            preprocessed_sa = preprocess_to_sa_model(sa)
            f.write(f'{preprocessed_sa}\n')

    with open(sa_lm_texts_filepath, 'rb') as f:
        # TODO: review this --discount_fallback -> I recall it had to do with some limitation of the data used...
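        # Note on --discount_fallback: lmplz estimates modified Kneser-Ney discounts
        # from n-gram count-of-count statistics; on very small or skewed corpora that
        # estimation can fail, and the flag substitutes fixed fallback discounts so a
        # model can still be built.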
        txs_lm_process = subprocess.run(
            [KENLM, '--discount_fallback', '-o',
             str(n)],
            stdout=subprocess.PIPE,
            input=f.read())

    lm_filename = f'sa_lm_model_{n}_{db_names_id}.arpa'
    lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)
    with open(lm_filepath, 'wb') as f:
        f.write(txs_lm_process.stdout)
Example 3
def make_dp_lm(db_names, n=2):
    # builds the discourse plan scoring model

    # read the entries of the databases named in db_names
    dataset = load_datasets(db_names)
    db_names_id = '_'.join(sorted(db_names))

    orders = []

    # for each entry with len(triples) >= 2, extract the orders in which the
    #    triples were verbalized
    for e in dataset:
        if len(e.triples) >= 2:
            order = extract_orders(e)
            orders.extend(order)

    dp_lm_texts_filename = f'txs_dp_texts_{db_names_id}.txt'
    dp_lm_texts_filepath = os.path.join(PRETRAINED_DIR, dp_lm_texts_filename)

    with open(dp_lm_texts_filepath, 'w') as f:
        for order in orders:
            preprocessed_order = preprocess_to_dp_model(order)
            f.write(f'{preprocessed_order}\n')

    with open(dp_lm_texts_filepath, 'rb') as f:
        txs_lm_process = subprocess.run([KENLM, '-o', str(n)],
                                        stdout=subprocess.PIPE,
                                        input=f.read())

    lm_filename = f'dp_lm_model_{n}_{db_names_id}.arpa'
    lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)
    with open(lm_filepath, 'wb') as f:
        f.write(txs_lm_process.stdout)
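At scoring time, a candidate plan would presumably go through the same preprocess_to_dp_model step before being scored against the ARPA file. A minimal sketch, assuming the kenlm Python bindings; the file name and candidate_order are hypothetical stand-ins:

import kenlm

# candidate_order stands for any plan in the same format produced by extract_orders above
dp_lm = kenlm.Model(os.path.join(PRETRAINED_DIR, 'dp_lm_model_2_webnlg.arpa'))
score = dp_lm.score(preprocess_to_dp_model(candidate_order), bos=True, eos=True)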
Example 4
def make_text_selection_lm(db_names, n, lm_name):

    db_names_id = '_'.join(sorted(db_names))

    texts_filename = f'txs_lm_texts_{db_names_id}.txt'
    texts_filepath = os.path.join(PRETRAINED_DIR, texts_filename)

    if not os.path.isfile(texts_filepath):
        dataset = load_datasets(db_names)
        txs_lm_texts = [
            normalize_thiagos_template(l['text'].lower()) for e in dataset
            for l in e.lexes if l['comment'] == 'good'
        ]
        txs_lm_texts = [normalize_text(t) for t in txs_lm_texts]

        with open(texts_filepath, 'w', encoding='utf-8') as f:
            for t in txs_lm_texts:
                f.write(f'{t}\n')

    with open(texts_filepath, 'rb') as f:
        txs_lm_process = subprocess.run([KENLM, '-o', str(n)],
                                        stdout=subprocess.PIPE,
                                        input=f.read())

    lm_filename = f'txs_lm_model_{lm_name}_{n}_{db_names_id}.arpa'
    lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)

    with open(lm_filepath, 'wb') as f:
        f.write(txs_lm_process.stdout)
Example 5
def make_template_selection_lm(db_names, n, lm_name):

    db_names_id = '_'.join(sorted(db_names))

    texts_filename = f'tems_lm_texts_{lm_name}_{db_names_id}.txt'
    texts_filepath = os.path.join(PRETRAINED_DIR, texts_filename)

    if not os.path.isfile(texts_filepath):
        dataset = load_datasets(db_names)

        e_t = extract_templates(dataset)
        tems_lm_texts = make_template_lm_texts(e_t)

        with open(texts_filepath, 'w', encoding='utf-8') as f:
            for t in tems_lm_texts:
                f.write(f'{t}\n')

    with open(texts_filepath, 'rb') as f:
        tems_lm_process = subprocess.run([KENLM, '-o', str(n)],
                                         stdout=subprocess.PIPE,
                                         input=f.read())

    lm_filename = f'tems_lm_model_{lm_name}_{n}_{db_names_id}.arpa'
    lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)
    with open(lm_filepath, 'wb') as f:
        f.write(tems_lm_process.stdout)
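In these two builders, n sets the n-gram order passed to lmplz, while lm_name only tags the output file names (and, for the template LM, the cached text file), so several variants can live side by side in PRETRAINED_DIR. A hypothetical invocation; the split name and lm_name value are assumptions:

make_text_selection_lm(['train'], n=3, lm_name='lower')
make_template_selection_lm(['train'], n=3, lm_name='lower')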
Example 6
def make_template_db(db_names):

    db_names_id = '_'.join(sorted(db_names))
    dataset = load_datasets(db_names)
    e_t = extract_templates(dataset)

    template_db = defaultdict(set)

    for e, lexes_templates in e_t:
        for _, ts in lexes_templates:
            # group the extracted templates by the triples they realize
            for t in [t for t in ts if t]:
                template_db[t.template_triples].add(t)
    template_db = dict(template_db)

    template_db_filename = f'template_db_{db_names_id}'
    template_db_filepath = os.path.join(PRETRAINED_DIR, template_db_filename)

    with open(template_db_filepath, 'wb') as f:
        pickle.dump(template_db, f)
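The pickled template_db maps a group of template triples (the t.template_triples key) to the set of templates extracted for it, so it can be read back and queried directly. A minimal sketch with hypothetical file and key names:

import os
import pickle

with open(os.path.join(PRETRAINED_DIR, 'template_db_webnlg'), 'rb') as f:
    template_db = pickle.load(f)

# every template known for a given group of triples; empty set if none was extracted
templates = template_db.get(some_template_triples, set())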