Exemplo n.º 1
0
def read_in_memory(input_data):
    """
    Turn a block of text into a list of segments, one per line

    :param input_data: str; text that is split into lines with splitlines()
    :return: list of segment objects whose 'text' is the corresponding line
        with leading and trailing whitespace removed
    """
    return [
        segment({"text": raw_line.strip()})
        for raw_line in input_data.splitlines()
    ]
Exemplo n.º 2
0
def read_file(file_name):
    """
    Load a TXT file as a list of segments, one per line

    :param file_name: path of the text file; it is opened as UTF-8
    :return: list of segment objects whose 'text' is the corresponding line
        with leading and trailing whitespace removed
    """
    with open(file_name, encoding="utf-8") as txt_file:
        return [segment({'text': raw_line.strip()}) for raw_line in txt_file]
Exemplo n.º 3
0
def parse_segment(input_seg):
    """
    Build an asrtoolkit segment object from a single gk segment

    :param: input_seg: dict (segment-level dict: input_data['segments'][i])
      whose keys such as 'channel' or 'startTimeSec' are copied onto the
      matching segment attributes
    :return: the segment object if it could be built and passes validate();
      otherwise None
    """
    collected = {}

    def keep_as_is(val):
        """Default formatter: hand the value back untouched"""
        return val

    def copy_field(source_key, target_key, nested_key, formatter):
        """
        Copy one entry of input_seg into the dict used to build the segment

        :param source_key:  key looked up in input_seg
        :param target_key:  key written to; None means reuse source_key
        :param nested_key:  if truthy, the value is read from
                            input_seg[source_key][nested_key] instead
        :param formatter:   function applied to the value before storing

        Nothing is stored when the key (or nested key) is absent, or when
        the formatted value is "" or None.
        """
        if source_key not in input_seg:
            return
        raw = input_seg[source_key]
        if nested_key:
            if nested_key not in raw:
                return
            raw = raw[nested_key]
        formatted = formatter(raw)
        if formatted not in {"", None}:
            target = source_key if target_key is None else target_key
            collected[target] = formatted

    seg = None
    try:
        # (source key, target key, nested key, formatter), applied in order:
        # a later non-empty value replaces an earlier one for the same target
        field_specs = (
            ("channel", None, None, keep_as_is),
            ("startTimeSec", "start", None, keep_as_is),
            ("stopTimeSec", "stop", None, keep_as_is),
            ("endTimeSec", "stop", None, keep_as_is),
            ("transcript", "text", None, keep_as_is),
            ("corrected_transcript", "text", None, keep_as_is),
            ("formatted_transcript", "formatted_text", None, keep_as_is),
            ("punctuated_transcript", "formatted_text", None, keep_as_is),
            ("speakerInfo", "speaker", None, sanitize),
            ("genderInfo", "label", "gender", "<o,f0,{:}>".format),
            ("confidence", "confidence", None, keep_as_is),
        )
        for source_key, target_key, nested_key, formatter in field_specs:
            copy_field(source_key, target_key, nested_key, formatter)

        seg = segment(collected)

    except Exception as exc:
        # any failure while extracting or building is logged, not raised
        LOGGER.exception(exc)

    if seg and seg.validate():
        return seg
    return None
Exemplo n.º 4
0
def parse_segment(input_seg):
    """
    Build a segment object from a single GreenKey segment dict

    :param: input_seg: dict  (segment-level dict: input_data['segments'][i]['segment'])
        -> keys such as 'channel' or 'startTimeSec' are copied onto the
        matching segment attributes, e.g. 'startTimeSec' becomes 'start'

    :return: the segment object if it could be built and passes validate();
        otherwise None
    """
    collected = {}

    def copy_field(value, dict_key=None, interior_key=None,
                   proc_val=lambda val: val):
        """
        :param value: key looked up in input_seg
        :param dict_key: key written to in the collected dict; defaults to value
        :param interior_key: if truthy, read input_seg[value][interior_key] instead
        :param proc_val: function applied to the value before it is stored
        :return: None

        Stores the processed value only when the key (and the interior key,
        if one is given) is present in input_seg
        """
        if value not in input_seg:
            return
        raw = input_seg[value]
        if interior_key:
            if interior_key not in raw:
                return
            raw = raw[interior_key]
        target = value if dict_key is None else dict_key
        collected[target] = proc_val(raw)

    seg = None
    try:
        # order matters: a later call overwrites an earlier one that shares
        # the same target key ('stop', 'text', 'formatted_text')
        copy_field("channel")
        copy_field("startTimeSec", dict_key="start")
        copy_field("stopTimeSec", dict_key="stop")
        copy_field("endTimeSec", dict_key="stop")
        copy_field("transcript", dict_key="text")
        copy_field("corrected_transcript", dict_key="text")
        copy_field("formatted_transcript", dict_key="formatted_text")
        copy_field("punctuated_transcript", dict_key="formatted_text")
        copy_field("speakerInfo", dict_key="speaker", interior_key="ID")
        copy_field("genderInfo", dict_key="label", interior_key="gender",
                   proc_val="<o,f0,{:}>".format)
        copy_field("confidence", dict_key="confidence")

        seg = segment(collected)

    except Exception as exc:
        # any failure while extracting or building is logged, not raised
        LOGGER.exception(exc)

    if seg and seg.validate():
        return seg
    return None
Exemplo n.º 5
0
def parse_line(line):
    """
    Parse a single line of an html file into a segment

    :param line: object whose findAll("td") returns the cells of the line
    :return: segment built from the first child of each of the three cells
        (start/stop, speaker, text) if it passes validate(); None if the
        line has no cells or the segment is not valid
    """
    cells = line.findAll("td")
    if not cells:
        return None

    # the first child node of every cell carries its content
    start_stop, speaker, text = [list(cell.children)[0] for cell in cells]
    # strip the first and last characters wrapped around "start - stop"
    start, stop = start_stop[1:-1].split(" - ")
    candidate = segment({
        "speaker": speaker,
        "start": start,
        "stop": stop,
        "text": text
    })
    return candidate if candidate.validate() else None
Exemplo n.º 6
0
def parse_line(line):
    """
    Parse a single line of an html file into a segment

    :param line: object whose findAll('td') returns the cells of the line
    :return: a validated segment, or None when the line has no cells or
        the segment built from them fails validate()
    """
    parsed = None
    cells = line.findAll('td')
    if cells:
        # one content node per cell, in the order: times, speaker, text
        first_children = [list(cell.children)[0] for cell in cells]
        start_stop, speaker, text = first_children
        # the time cell wraps "start - stop" in one leading and one
        # trailing character, which are dropped before splitting
        time_range = start_stop[1:-1]
        start, stop = time_range.split(" - ")
        candidate = segment({
            'speaker': speaker,
            'start': start,
            'stop': stop,
            'text': text,
        })
        if candidate.validate():
            parsed = candidate
    return parsed
Exemplo n.º 7
0
def read_file(file_name):
    """
    Reads an RTTM file

    Every non-blank line is split on whitespace into ten fields. The 2nd,
    3rd, 4th, 5th and 8th fields are used as filename, channel, start,
    duration and speaker; the remaining fields are ignored.

    :param file_name: path of the RTTM file to read
    :return: list of segment objects, one per non-blank line; ``stop`` is
        the numeric sum of the start and duration fields
    :raises ValueError: if a non-blank line does not have exactly ten
        fields, or its start/duration fields are not numeric
    """
    segments = []
    with open(file_name) as data:
        for line in data:
            fields = line.split()
            if not fields:
                # tolerate blank lines instead of failing on the unpack below
                continue
            _, filename, channel, start, duration, _, _, speaker, _, _ = fields
            seg = segment(
                filename=filename,
                channel=channel,
                start=start,
                # start and duration are strings at this point: adding them
                # directly would concatenate text rather than sum seconds
                stop=float(start) + float(duration),
                speaker=speaker,
            )
            segments.append(seg)
    return segments
Exemplo n.º 8
0
def read_caption(caption):
  """
    Convert a caption object into a segment object

    The caption's start_in_seconds, end_in_seconds and text attributes are
    read; every match of non_transcript_marks is removed from the stripped
    text, which is then stripped again.

    Returns the segment if it was built and passes validate(); otherwise
    None. Any exception raised while building it is printed, not raised.
  """
  try:
    fields = {
      'start': caption.start_in_seconds,
      'stop': caption.end_in_seconds,
      'text': re.sub(non_transcript_marks, "", caption.text.strip()).strip(),
    }
    candidate = segment(fields)
  except Exception as exc:
    print(exc)
    return None

  if candidate and candidate.validate():
    return candidate
  return None
Exemplo n.º 9
0
def parse_line(line):
    """
    Parse a single line of an stm file

    :param line: str; whitespace-separated fields: audiofile, channel,
        speaker, start, stop, label, followed by the words of the text
    :return: segment object if the line has more than six fields and the
        segment passes validate(); else None
    """
    tokens = line.strip().split()
    if len(tokens) <= 6:
        # six metadata fields plus at least one word of text are required
        return None

    keys = ('audiofile', 'channel', 'speaker', 'start', 'stop', 'label')
    fields = dict(zip(keys, tokens[:6]))
    fields['text'] = " ".join(tokens[6:])

    candidate = segment(fields)
    return candidate if candidate and candidate.validate() else None
Exemplo n.º 10
0
def parse_line(line):
    """
    Parse one line of an stm file into a segment

    :param line: str; whitespace-separated fields: filename, channel,
        speaker, start, stop, label, followed by the words of the text
    :return: segment object if the line has more than six fields and the
        segment passes validate(); else None
    """
    tokens = line.strip().split()
    if len(tokens) <= 6:
        # six metadata fields plus at least one word of text are required
        return None

    filename, channel, speaker, start, stop, label, *words = tokens
    candidate = segment({
        "filename": filename,
        "channel": channel,
        "speaker": speaker,
        "start": start,
        "stop": stop,
        "label": label,
        "text": " ".join(words),
    })
    if candidate is not None and candidate.validate():
        return candidate
    return None
Exemplo n.º 11
0
def parse_line(line):
    """
    Convert a single stm file line into a segment

    :param line: str; six whitespace-separated metadata fields (filename,
        channel, speaker, start, stop, label) followed by the text
    :return: segment object when more than six fields are present and the
        segment passes validate(); None otherwise
    """
    n_meta = 6
    tokens = line.strip().split()

    parsed = None
    if len(tokens) > n_meta:
        meta, words = tokens[:n_meta], tokens[n_meta:]
        parsed = segment({
            'filename': meta[0],
            'channel': meta[1],
            'speaker': meta[2],
            'start': meta[3],
            'stop': meta[4],
            'label': meta[5],
            'text': " ".join(words),
        })

    if parsed is None:
        return None
    return parsed if parsed.validate() else None