Example #1
def corrected_ngrams(
    refs_hyps_dir=f"{os.environ['HOME']}/googledrive/data/asr_data/results/kenlm_3_089_mp3",
    order=3,
):

    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")

    g = (
        " ".join(ngram)
        for ref, hyp in tqdm(zip(refs, hyps))
        for ngram in calc_corrected_ngrams(tokenize(ref), tokenize(hyp), order)
    )
    return list(g)
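A minimal usage sketch: `data_io.read_lines` is handed gzipped files directly above, so it presumably decompresses them transparently; the output path below is hypothetical and `write_lines` is assumed to accept any iterable of strings, as in the later examples.

from util import data_io

# Collect the corrected 3-grams and persist them for later inspection.
ngrams = corrected_ngrams(order=3)
data_io.write_lines("/tmp/corrected_3grams.txt", ngrams)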
Example #2
def topicalchat(
    file_name="train",
    data_path=os.environ["HOME"] + "/data/QA/topical-chat/processed_output",
    limit=None,
):
    backgrounds = data_io.read_lines(os.path.join(data_path, file_name) +
                                     ".fct",
                                     limit=limit)
    dialogs = data_io.read_lines(os.path.join(data_path, file_name) + ".src",
                                 limit=limit)
    targets = data_io.read_lines(os.path.join(data_path, file_name) + ".tgt",
                                 limit=limit)
    for b, d, t in tqdm(zip(backgrounds, dialogs, targets)):
        turns = d.split("_eos")[:-1] + [t.strip("_go").strip("_eos")]
        yield turns[-3:]
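A minimal usage sketch, assuming the processed Topical-Chat files exist under the default `data_path`:

# Inspect the (at most three) turns yielded for the first ten dialogs.
for turns in topicalchat(file_name="train", limit=10):
    print(" | ".join(turns))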
Example #3
def doc_generator(file_path, limit=None):
    """
    PubTator docs are of the form:

    UID|TITLE|TEXT
    UID|ABSTRACT|TEXT
    UID   SPAN   MENTION   ENTITY_TYPE  MESH_ID
    ...

    See -- data/bioconcepts2pubtator_offsets.sample

    """
    k = 0
    lines = []
    for line in data_io.read_lines(file_path):
        if len(line.rstrip()) == 0:
            if len(lines) > 0:
                # filter docs to target set; doc_id is parsed here but not used further in this snippet
                doc_id = re.split(r'\|', lines[0].rstrip(), maxsplit=2)
                yield lines
                lines = []
                k += 1
                if limit is not None and k > limit:
                    break
        else:
            lines.append(line)
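A minimal usage sketch, assuming the sample file referenced in the docstring is available locally:

# Print the title line (UID|TITLE|TEXT) of the first few PubTator documents.
for doc_lines in doc_generator("data/bioconcepts2pubtator_offsets.sample", limit=3):
    print(doc_lines[0].rstrip())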
Example #4
def preprocessed_topicalchat(
    file_name="train",
    data_path=os.environ["HOME"] + "/data/QA/topical-chat/processed_output",
    use_danqi=False,
):
    backgrounds = data_io.read_lines(
        os.path.join(data_path, file_name) + ".fct")
    dialogs = data_io.read_lines(os.path.join(data_path, file_name) + ".src")
    targets = data_io.read_lines(os.path.join(data_path, file_name) + ".tgt")
    for b, d, t in tqdm(zip(backgrounds, dialogs, targets)):
        turns = d.split("_eos")[:-1] + [t.strip("_go").strip("_eos")]

        if len(turns) % 2 != 0:
            turns = [SILENCE] + turns

        turns = [Turn(turns[k], turns[k + 1]) for k in range(0, len(turns), 2)]
        yield build_input_target(b, turns, SEP, use_danqi=use_danqi)
Example #5
def parse_csv_to_examples_build_fields(dataset_name="AG_NEWS"):
    csv_path = DATA_PATH + "/" + dataset_name.lower() + "_csv"
    if not os.path.isdir(csv_path):
        # download data
        _, _ = text_classification.DATASETS[dataset_name](root=DATA_PATH)

    train_csv = csv_path + "/train.csv"
    test_csv = csv_path + "/test.csv"

    def regex_tokenizer(
        text,
        pattern=r"(?u)\b\w\w+\b"
    ) -> List[str]:  # pattern stolen from scikit-learn
        return [m.group() for m in re.finditer(pattern, text)]

    def parse_line(line):
        splits = line.split(",")
        text = " ".join(splits[1:])
        label = splits[0]
        return text, label

    text_field = Field(include_lengths=False,
                       batch_first=True,
                       tokenize=regex_tokenizer)
    label_field = Field(batch_first=True, sequential=False, unk_token=None)
    fields = [("text", text_field), ("label", label_field)]

    g = (parse_line(l) for l in tqdm(data_io.read_lines(train_csv)))
    train_examples = [
        Example.fromlist([text, label], fields) for text, label in g
    ]

    text_field.build_vocab([example.text for example in train_examples])
    label_field.build_vocab([example.label for example in train_examples])

    g = (parse_line(l) for l in tqdm(data_io.read_lines(test_csv)))
    test_examples = [
        Example.fromlist([text, label], fields) for text, label in g
    ]
    return train_examples, test_examples, fields
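A hedged sketch of batching the returned examples, assuming the legacy torchtext (< 0.9) data API that the snippet appears to use:

from torchtext.data import BucketIterator, Dataset

train_examples, test_examples, fields = parse_csv_to_examples_build_fields("AG_NEWS")
train_iter = BucketIterator(
    Dataset(train_examples, fields),
    batch_size=32,
    sort_key=lambda ex: len(ex.text),  # bucket batches by tokenized text length
)
for batch in train_iter:
    text, label = batch.text, batch.label  # tensors built from the vocabularies above
    break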
Example #6
    def build_trees(file, dataset_name) -> Trees:
        trees = {}

        def add_triple(subj, predi, obje, trees: Trees):
            if subj not in trees:
                trees[subj] = {}
            if predi not in trees[subj]:
                trees[subj][predi] = set()
            trees[subj][predi].add(obje)

        for line in data_io.read_lines(file):
            s, o, p = line.strip().split("\t")
            s_id, o_id, p_id = get_id(ent2id, s), get_id(ent2id, o), get_id(rel2id, p)
            add_triple(s_id, p_id, o_id, trees)
            if 'train' in dataset_name:
                p_inv_id = get_id(rel2id, p + "_inv")
                add_triple(o_id, p_inv_id, s_id, trees)

        return trees
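The snippet depends on `ent2id`, `rel2id` and a `get_id` helper from its enclosing scope; a purely hypothetical sketch of such a helper (not taken from the source) might be:

def get_id(name2id: dict, name: str) -> int:
    # Hypothetical helper: hand out consecutive integer ids on first sight.
    if name not in name2id:
        name2id[name] = len(name2id)
    return name2id[name]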
Example #7
def create_or_load_raw_transcript(video_file, model_name) -> str:
    file = Path(f"{APP_DATA_DIR}/{video_file}")
    raw_transcript_file = (
        f"{SUBTITLES_DIR}/{file.stem}_{raw_transcript_name(model_name)}.txt")
    if not os.path.isfile(raw_transcript_file):
        asr = SpeechToText(model_name=model_name).init()
        transcript = convert_to_wav_transcribe(asr, str(file))
        data_io.write_lines(
            get_letters_csv(video_file, model_name),
            [f"{l.letter}\t{l.index}" for l in transcript.letters],
        )

        raw_transcript = "".join([l.letter for l in transcript.letters])
        data_io.write_lines(
            raw_transcript_file,
            [raw_transcript],
        )
    else:
        raw_transcript = list(data_io.read_lines(raw_transcript_file))[0]
    return raw_transcript
Example #8
    def read_lines_from_files(
        self, path, mode="b", encoding="utf-8", limit=sys.maxsize
    ):
        c = 0
        for file in os.listdir(path):
            if self.state.get(file, 0) == "all":
                continue
            for line_idx, line in enumerate(
                data_io.read_lines(path + "/" + file, mode, encoding)
            ):
                c += 1
                if line_idx < self.state.get(file, 0):
                    continue
                if c > limit:
                    break
                yield line
                self.state[file] = line_idx
                if c % self.write_interval == 0:
                    data_io.write_json(self.state_file, self.state)
            self.state[file] = "all"
Example #9
def segment_transcript_to_subtitle_blocks(
    transcript_letters_csv, translated_transcript: List[TranslatedTranscript]
) -> List[Dict[str, List[LetterIdx]]]:
    g = (line.split("\t")
         for line in data_io.read_lines(str(transcript_letters_csv)))
    raw_letters = [LetterIdx(l, int(i)) for l, i in g]
    assert all((raw_letters[k].r_idx > raw_letters[k - 1].r_idx
                for k in range(1, len(raw_letters))))

    subtitles = []
    letters = raw_letters
    for tt in sorted(translated_transcript, key=lambda x: x.order):
        aligned_letters = temporal_align_text_to_letters(tt.text, letters)
        subtitles.append((tt.name, aligned_letters))
        # align the next transcript on the already aligned one;
        # heuristic is to use language similarities: native -> spanish -> english -> german
        letters = aligned_letters

    _, first_aligned = subtitles[0]
    named_blocks = [
        cut_block_out_of_transcript(subtitles, s, e)
        for s, e in generate_block_start_ends(first_aligned)
    ]
    return named_blocks
Example #10
import os
from pprint import pprint
import json
from rouge import Rouge

from util import data_io
#
# with open('./tests/data.json') as f:
#   data = json.load(f)
#
# hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
rouge = Rouge()
# scores = rouge.get_scores(hyps, refs)
hyps = list(
    data_io.read_lines(
        os.environ['HOME'] +
        '/hpc/transformers/examples/summarization/bart/cnn_predicted_summaries.txt',
        limit=1000))
refs = list(
    data_io.read_lines(os.environ['HOME'] + '/hpc/data/cnn_dm/test.target',
                       limit=1000))
scores = rouge.get_scores(hyps, refs, avg=True)
pprint(scores)
'''
{'rouge-1': {'f': 0.2923597368088391,
             'p': 0.2430148556164662,
             'r': 0.38675644246961155},
 'rouge-2': {'f': 0.11613720277501925,
             'p': 0.09577817141610762,
             'r': 0.15578198219928469},
 'rouge-l': {'f': 0.28175826380865326,
             'p': 0.23867975605955946,
Example #11
import os

from util import data_io

if __name__ == "__main__":
    base_path = os.environ["HOME"] + "/hpc/data/parallel_text_corpora/wmt_en_ro"
    files = [
        f for f in os.listdir(base_path)
        if f.endswith(".source") or f.endswith(".target")
    ]
    some_data = "some_data"
    os.makedirs("%s" % some_data, exist_ok=True)
    for f in files:
        data_io.write_lines(
            "%s/%s" % (some_data, f),
            data_io.read_lines(base_path + "/%s" % f, limit=1000),
        )
Example #12
from collections import Counter
from time import time

import dask.bag as db
from tqdm import tqdm
from util import data_io
from dask.distributed import Client, progress

from alignment import calc_aligned_ngram_tuples

if __name__ == "__main__":
    client = Client()
    print(client)

    refs_hyps_dir = "/tmp/train_kenlm_3_089_mp3"
    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")
    refs_hyps = list((ref, hyp) for ref, hyp in zip(refs, hyps))

    tokenize = lambda s: s.split(" ")
    order = 3

    start = time()
    aligned_ngrams = (db.from_sequence(
        refs_hyps, npartitions=4 * 4).map(lambda rh: calc_aligned_ngram_tuples(
            tokenize(rh[0]), tokenize(rh[1]), order)).flatten().map(
                lambda rh: (" ".join(rh[0]), " ".join(rh[1]))))

    def error_rate(ref, hyp_counts: Counter):
        overall_num_erros = sum(v for k, v in hyp_counts.items() if ref != k)
        num_correct = hyp_counts[ref]
Example #13
    "--target_file",
    default=os.environ["HOME"] +
    "/data/QA/topical-chat/processed_output/test_rare.tgt",
    type=str,
)
parser.add_argument(
    "--pred_file",
    default="test_rare.pred",
    type=str,
)

if __name__ == "__main__":
    args = parser.parse_args()

    rouge = Rouge()
    sources = [
        " "  # beginning with space? see: https://github.com/huggingface/transformers/blob/5ddd8d6531c8c49fdd281b55b93f6c81c9826f4b/examples/summarization/bart/evaluate_cnn.py#L66
        + x.rstrip() for x in data_io.read_lines(args.source_file)
    ]
    targets = list(data_io.read_lines(args.target_file))
    hyps = list(
        generate_summaries_or_translations(
            sources,
            args.model_path,
            batch_size=8,
            fp16=True,
        ))
    data_io.write_lines(args.pred_file, hyps)

    pprint(calc_rouge_scores(hyps, targets))
Example #14
    type=str,
)
parser.add_argument(
    "--target_file",
    default=os.environ["HOME"] +
    "/gunther/Response-Generation-Baselines/processed_output/test_rare.tgt",
    type=str,
)


def calc_rouge_scores(pred: List[str], tgt: List[str]):
    rouge = Rouge()
    scores = rouge.get_scores(pred, tgt, avg=True)
    scores = {
        "f1-scores":
        {s: v
         for s, d in scores.items() for k, v in d.items() if k == "f"},
        "huggingface-rouge": calculate_rouge(pred, tgt)
    }
    return scores


if __name__ == "__main__":

    args = parser.parse_args()

    pred = list(data_io.read_lines(args.pred_file))
    tgt = list(data_io.read_lines(args.target_file))

    pprint(calc_rouge_scores(pred, tgt))
Example #15
def build_id_text_generator(paths: List[str]):
    for p in paths:
        for file in Path(p).rglob("*.trans.txt"):
            for line in data_io.read_lines(str(file)):
                eid, text = line.split(" ", 1)
                yield eid, text
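A minimal usage sketch, assuming LibriSpeech-style `*.trans.txt` files under a hypothetical local path and reusing `data_io.write_lines` from the earlier examples:

from util import data_io

# Flatten all (utterance-id, transcript) pairs into one tab-separated file.
pairs = build_id_text_generator(["/data/LibriSpeech/train-clean-100"])
data_io.write_lines(
    "/tmp/librispeech_transcripts.tsv",
    (f"{eid}\t{text}" for eid, text in pairs),
)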