def read_in_memory(input_data):
    """
    Reads input text

    :param input_data: str; text that is split into lines with splitlines()
    :return: list holding one segment object per line, each built from
      {"text": <line with surrounding whitespace stripped>}
    """
    return [
        segment({"text": raw_line.strip()})
        for raw_line in input_data.splitlines()
    ]
def read_file(file_name):
    """
    Reads a TXT file

    :param file_name: path of the text file; it is opened as UTF-8
    :return: list holding one segment object per line of the file, each built
      from {'text': <line with surrounding whitespace stripped>}
    """
    with open(file_name, encoding="utf-8") as txt_file:
        return [segment({'text': raw_line.strip()}) for raw_line in txt_file]
def parse_segment(input_seg):
    """
    Builds an asrtoolkit segment object out of one input gk segment

    :param input_seg: dict (segment-level dict: input_data['segments'][i])
      with keys such as 'channel', 'startTimeSec' etc mapping to attributes
    :return: the segment object when it could be built and its validate()
      call succeeds; None otherwise. Any exception raised while extracting
      fields or building the segment is logged instead of propagated.
    """
    extracted_dict = {}

    def copy_field(source_key, target_key=None, nested_key=None, convert=None):
        """
        Copies one value from input_seg into extracted_dict

        Nothing is stored when the key is absent, when the nested key is
        requested but absent, or when the converted value is "" or None.

        :param source_key: key looked up in input_seg
        :param target_key: key written in extracted_dict (defaults to source_key)
        :param nested_key: when given, the value is read from input_seg[source_key][nested_key]
        :param convert: optional function applied to the value before storing
        """
        if source_key not in input_seg:
            return
        found = input_seg[source_key]
        if nested_key:
            if nested_key not in found:
                return
            found = found[nested_key]
        if convert is not None:
            found = convert(found)
        if found in {"", None}:
            return
        extracted_dict[source_key if target_key is None else target_key] = found

    seg = None
    try:
        # Order matters: later rows overwrite earlier ones that share a target
        # key (e.g. "endTimeSec" wins over "stopTimeSec" for "stop").
        field_specs = (
            ("channel",),
            ("startTimeSec", "start"),
            ("stopTimeSec", "stop"),
            ("endTimeSec", "stop"),
            ("transcript", "text"),
            ("corrected_transcript", "text"),
            ("formatted_transcript", "formatted_text"),
            ("punctuated_transcript", "formatted_text"),
            ("speakerInfo", "speaker", None, sanitize),
            ("genderInfo", "label", "gender", "<o,f0,{:}>".format),
            ("confidence", "confidence"),
        )
        for spec in field_specs:
            copy_field(*spec)
        seg = segment(extracted_dict)
    except Exception as exc:
        LOGGER.exception(exc)

    if seg and seg.validate():
        return seg
    return None
def parse_segment(input_seg):
    """
    Builds a segment object out of one input GreenKey segment

    :param input_seg: dict (segment-level dict:
      input_data['segments'][i]['segment']) with keys such as 'channel',
      'startTimeSec' etc mapping to segment object attributes
    :return: the segment object when it could be built and its validate()
      call succeeds; None otherwise. Attribute values come from the
      corresponding input keys, e.g. 'startTimeSec' is stored as 'start'.
      Any exception raised along the way is logged instead of propagated.
    """
    extracted_dict = {}

    def store(source_key, target_key=None, nested_key=None, convert=lambda item: item):
        """
        Copies one value from input_seg into extracted_dict if it is present

        :param source_key: key looked up in input_seg
        :param target_key: key written in extracted_dict (defaults to source_key)
        :param nested_key: when given, the value is read from
          input_seg[source_key][nested_key] and skipped if that key is absent
        :param convert: function applied to the value before it is stored
        """
        if target_key is None:
            target_key = source_key
        if source_key not in input_seg:
            return
        if not nested_key:
            extracted_dict[target_key] = convert(input_seg[source_key])
        elif nested_key in input_seg[source_key]:
            extracted_dict[target_key] = convert(input_seg[source_key][nested_key])

    seg = None
    try:
        store("channel")
        store("startTimeSec", target_key="start")
        # "endTimeSec" is stored after "stopTimeSec", so it wins when both exist
        store("stopTimeSec", target_key="stop")
        store("endTimeSec", target_key="stop")
        # likewise the corrected transcript overrides the plain transcript
        store("transcript", target_key="text")
        store("corrected_transcript", target_key="text")
        store("formatted_transcript", target_key="formatted_text")
        store("punctuated_transcript", target_key="formatted_text")
        store("speakerInfo", target_key="speaker", nested_key="ID")
        store(
            "genderInfo",
            target_key="label",
            nested_key="gender",
            convert="<o,f0,{:}>".format,
        )
        store("confidence", target_key="confidence")
        seg = segment(extracted_dict)
    except Exception as exc:
        LOGGER.exception(exc)

    if seg and seg.validate():
        return seg
    return None
def parse_line(line):
    """
    Parses a single line of an html file

    :param line: object exposing findAll("td"); each returned cell must expose
      a `children` iterable (presumably a parsed table row - confirm with caller)
    :return: segment object built from the first child of each of the three
      cells (time span, speaker, text) when it validates; None when the row
      has no td cells or the segment fails validate()
    """
    cells = line.findAll("td")
    if not cells:
        return None

    start_stop, speaker, text = [list(cell.children)[0] for cell in cells]
    # drop the first and last character around the time span, then split it
    start, stop = start_stop[1:-1].split(" - ")
    fields = {
        "speaker": speaker,
        "start": start,
        "stop": stop,
        "text": text
    }
    seg = segment(fields)
    return seg if seg.validate() else None
def parse_line(line):
    """
    Parses a single line of an html file

    :param line: object exposing findAll('td'); each returned cell must expose
      a `children` iterable (presumably a parsed table row - confirm with caller)
    :return: segment object with speaker, start, stop and text taken from the
      first child of each of the three cells; None when there are no td cells
      or when the segment fails validate()
    """
    seg = None
    table_cells = line.findAll('td')
    if table_cells:
        leading_children = []
        for cell in table_cells:
            leading_children.append(list(cell.children)[0])
        time_span, speaker, text = leading_children
        # the time span loses its first and last character before splitting
        start, stop = time_span[1:-1].split(" - ")
        candidate = segment(dict(speaker=speaker, start=start, stop=stop, text=text))
        if candidate.validate():
            seg = candidate
    return seg
def read_file(file_name):
    """
    Reads an RTTM file

    Every non-blank line must split on whitespace into exactly ten fields.
    Field 2 is used as the filename, field 3 as the channel, field 4 as the
    start time, field 5 as the duration and field 8 as the speaker; the other
    fields are ignored.

    :param file_name: path of the RTTM file; it is opened as UTF-8
    :return: list of segment objects, one per non-blank line, with
      stop = start + duration computed numerically
    :raises ValueError: if a non-blank line does not have exactly ten fields
      or its start/duration fields are not numbers
    """
    segments = []
    # explicit encoding, matching the TXT reader, instead of the platform default
    with open(file_name, encoding="utf-8") as data:
        for line in data:
            # blank lines (e.g. a trailing newline) carry no segment
            if not line.strip():
                continue
            _, filename, channel, start, duration, _, _, speaker, _, _ = line.split()
            # the fields are strings: convert before adding, otherwise
            # "1.0" + "2.5" would concatenate to "1.02.5"
            start_sec = float(start)
            stop_sec = start_sec + float(duration)
            segments.append(
                segment(
                    filename=filename,
                    channel=channel,
                    start=start_sec,
                    stop=stop_sec,
                    speaker=speaker,
                )
            )
    return segments
def read_caption(caption):
    """
    Parses caption object to return a segment object

    :param caption: object providing start_in_seconds, end_in_seconds and text
    :return: segment object with the caption's start, stop and text (text is
      stripped and has every non_transcript_marks match removed) when it
      validates; None when validation fails or when any exception occurs
      (the exception is printed, not raised)
    """
    seg = None
    try:
        fields = {
            'start': caption.start_in_seconds,
            'stop': caption.end_in_seconds,
            'text': re.sub(non_transcript_marks, "", caption.text.strip()).strip(),
        }
        seg = segment(fields)
    except Exception as exc:
        print(exc)

    if seg and seg.validate():
        return seg
    return None
def parse_line(line):
    """
    Parses a single line of an stm file

    :param line: str; whitespace-separated fields audiofile, channel, speaker,
      start, stop, label followed by at least one word of text
    :return: segment object when the line has more than six fields and the
      segment validates; None otherwise
    """
    tokens = line.strip().split()
    if len(tokens) <= 6:
        return None

    field_names = ('audiofile', 'channel', 'speaker', 'start', 'stop', 'label')
    fields = dict(zip(field_names, tokens[:6]))
    # everything after the six header fields is the transcript text
    fields['text'] = " ".join(tokens[6:])

    seg = segment(fields)
    if seg and seg.validate():
        return seg
    return None
def parse_line(line):
    """
    Parses a single line of an stm file

    :param line: str; a single line of an stm file
    :return: segment object if the line holds more than six whitespace-separated
      fields and the resulting segment validates; else None
    """
    data = line.strip().split()
    if len(data) < 7:
        return None

    # six header fields first; whatever follows is the transcript words
    filename, channel, speaker, start, stop, label, *words = data
    seg = segment(
        dict(
            filename=filename,
            channel=channel,
            speaker=speaker,
            start=start,
            stop=stop,
            label=label,
            text=" ".join(words),
        )
    )
    if seg.validate():
        return seg
    return None
def parse_line(line):
    """
    Parses a single line of an stm file

    :param line: str; a single line of an stm file
    :return: segment object if the line holds more than six whitespace-separated
      fields and the resulting segment validates; else None
    """
    parts = line.strip().split()
    header, words = parts[:6], parts[6:]
    # a usable line needs all six header fields plus at least one text word
    if len(header) < 6 or not words:
        return None

    seg = segment({
        'filename': header[0],
        'channel': header[1],
        'speaker': header[2],
        'start': header[3],
        'stop': header[4],
        'label': header[5],
        'text': " ".join(words),
    })
    return seg if seg.validate() else None