Exemplo n.º 1
0
 def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None):
     """Set up an annotation type and derive its output name and attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type (e.g. a column name)
     subtype : str or None
         Name of the annotation type one level below this one
     supertype : str or None
         Name of the annotation type one level above this one
     attribute : Attribute, optional
         Attribute to associate with this type; when None, one is
         generated from ``name`` ('tier' if ``base``, else 'spelling')
     anchor : bool
         Whether this is the anchor (spelling/word) level
     token : bool
         Whether annotations of this type are tokens
     base : bool
         Whether this is a base (transcription/tier) level
     speaker : str, optional
         Speaker name; stripped from ``name`` to form ``output_name``
     """
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     self._list = []
     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base
     self.anchor = anchor
     self.speaker = speaker
     self.ignored = False
     if self.speaker is not None:
         # Fix: use a raw pattern so ``\W`` is a regex class rather than
         # an invalid string escape, and escape the speaker name, which
         # may contain regex metacharacters.
         self.output_name = re.sub(r"{}\W*".format(re.escape(self.speaker)), "", self.name)
     else:
         self.output_name = self.name
     if attribute is None:
         if base:
             self.attribute = Attribute(Attribute.sanitize_name(name), "tier", name)
         else:
             self.attribute = Attribute(Attribute.sanitize_name(name), "spelling", name)
     else:
         self.attribute = attribute
Exemplo n.º 2
0
 def __init__(self, name, subtype, supertype, attribute = None, anchor = False,
                 token = False, base = False, speaker = None):
     """Set up an annotation type and derive its output name and attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type (e.g. a column name)
     subtype : str or None
         Name of the annotation type one level below this one
     supertype : str or None
         Name of the annotation type one level above this one
     attribute : Attribute, optional
         Attribute to associate with this type; when None, one is
         generated from ``name`` ('tier' if ``base``, else 'spelling')
     anchor : bool
         Whether this is the anchor (spelling/word) level
     token : bool
         Whether annotations of this type are tokens
     base : bool
         Whether this is a base (transcription/tier) level
     speaker : str, optional
         Speaker name; stripped from ``name`` to form ``output_name``
     """
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     self._list = []
     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base
     self.anchor = anchor
     self.speaker = speaker
     self.ignored = False
     if self.speaker is not None:
         # Fix: raw pattern so ``\W`` is a proper regex class, and
         # re.escape() in case the speaker name contains metacharacters.
         self.output_name = re.sub(r'{}\W*'.format(re.escape(self.speaker)), '', self.name)
     else:
         self.output_name = self.name
     if attribute is None:
         if base:
             self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name)
         else:
             self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name)
     else:
         self.attribute = attribute
def test_ilg_basic(ilg_test_dir):
    """Load the basic ILG fixture and check that 'a' has frequency 2."""
    path = os.path.join(ilg_test_dir, 'test_basic.txt')
    transcription_attribute = Attribute('transcription', 'tier')
    transcription_attribute.delimiter = '.'
    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType('transcription', None, None,
                                        token=False, base=True,
                                        attribute=transcription_attribute)
    transcription_type.trans_delimiter = '.'
    corpus = load_discourse_ilg('test', path,
                                [spelling_type, transcription_type])
    print(corpus.words)
    print(corpus.lexicon.words)
    assert corpus.lexicon.find('a').frequency == 2
Exemplo n.º 4
0
 def __init__(
     self,
     name,
     subtype,
     supertype,
     attribute=None,
     anchor=False,
     token=False,
     base=False,
     speaker=None,
     is_default=False,
 ):
     """Set up an annotation type and derive its output name and attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type; NOTE this represents something like
         "Orthography" or "Transcription", not the user-facing column name
         (e.g. "canonical_pron") -- for that, see ``output_name`` or
         ``attribute``.
     subtype : str or None
         Name of the annotation type one level below this one
     supertype : str or None
         Name of the annotation type one level above this one
     attribute : Attribute, optional
         Attribute to associate with this type; when None, one is
         generated from ``name`` ('tier' if ``base``, else 'spelling')
     anchor : bool
         Whether this is the anchor (spelling) level
     token : bool
         Whether annotations of this type are tokens
     base : bool
         Whether this is a base (transcription/tier) level
     speaker : str, optional
         Speaker name; stripped from ``name`` to form ``output_name``
     is_default : bool
         Passed through to the autogenerated Attribute
     """
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     # Contains Annotations for spelling and BaseAnnotations for
     # transcriptions.
     self._list = []
     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base  #base is transcription/tier type
     self.anchor = anchor  #anchor is spelling type
     self.speaker = speaker
     self.ignored = False
     self.is_default = is_default
     if self.speaker is not None:
         # Fix: raw pattern so ``\W`` is a proper regex class, and
         # re.escape() in case the speaker name contains metacharacters.
         self.output_name = re.sub(r'{}\W*'.format(re.escape(self.speaker)),
                                   '', self.name)
     else:
         self.output_name = self.name
     if attribute is None:
         if base:
             self.attribute = Attribute(Attribute.sanitize_name(name),
                                        'tier',
                                        name,
                                        is_default=is_default)
         else:
             self.attribute = Attribute(Attribute.sanitize_name(name),
                                        'spelling',
                                        name,
                                        is_default=is_default)
     else:
         self.attribute = attribute
Exemplo n.º 5
0
def test_filter_model(qtbot):
    """Rows can be added to and removed from a FilterModel."""
    model = FilterModel()

    numeric_attribute = Attribute('test', 'numeric', 'Test')
    model.addRow((numeric_attribute, '__eq__', 0))
    assert model.data(model.index(0, 0), Qt.DisplayRole) == 'Test == 0'

    model.removeRow(0)
    assert len(model.filters) == 0

    factor_attribute = Attribute('test', 'factor', 'Test')
    model.addRow((factor_attribute, ['a', 'b', 'c']))
    assert model.data(model.index(0, 0), Qt.DisplayRole) == 'Test a, b, c'
Exemplo n.º 6
0
def test_ilg_data(ilg_test_dir):
    """ilg_to_data splits the basic fixture into words and segments."""
    path = os.path.join(ilg_test_dir, 'test_basic.txt')
    tier_attribute = Attribute('transcription', 'tier')
    tier_attribute.delimiter = '.'
    annotation_types = [
        AnnotationType('spelling', 'transcription', None,
                       token=False, anchor=True),
        AnnotationType('transcription', None, None,
                       token=False, base=True, attribute=tier_attribute),
    ]
    annotation_types[1].trans_delimiter = '.'
    data = ilg_to_data(path, annotation_types)

    # Three words, each spanning two segments of the transcription line.
    expected_words = []
    for label, begin, end in [('a', 0, 2), ('a', 2, 4), ('b', 4, 6)]:
        annotation = Annotation(label)
        annotation.references.append('transcription')
        annotation.begins.append(begin)
        annotation.ends.append(end)
        expected_words.append(annotation)

    assert data['spelling']._list == expected_words
    assert data['transcription']._list == [BaseAnnotation(segment)
                                           for segment in 'ababcd']
Exemplo n.º 7
0
    def accept(self):
        """Validate the tier dialog's inputs, then accept it.

        Rejects an empty tier name, a name that collides with a protected
        column, and (after confirmation) a duplicate tier name; requires at
        least one segment or feature value before accepting.
        """
        tierName = self.nameEdit.text()
        self.attribute = Attribute(tierName.lower().replace(' ', ''), 'tier',
                                   tierName)
        if tierName == '':
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please enter a name for the tier.")
            return
        elif self.attribute.name in self.corpus.basic_attributes:
            reply = QMessageBox.critical(
                self, "Invalid information",
                "The name '{}' overlaps with a protected column.".format(
                    tierName))
            return
        elif self.attribute in self.corpus.attributes:

            msgBox = QMessageBox(
                QMessageBox.Warning, "Duplicate tiers",
                "'{}' is already the name of a tier.  Overwrite?".format(
                    tierName), QMessageBox.NoButton, self)
            msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
            msgBox.addButton("Cancel", QMessageBox.RejectRole)
            if msgBox.exec_() != QMessageBox.AcceptRole:
                return
        inClass, notInClass = self.generateClass()
        if not inClass:
            reply = QMessageBox.critical(
                self, "Missing information",
                "Please specify at least one segment or one feature value")
            # Fix: previously fell through and accepted the dialog with an
            # empty class after showing the error.
            return
        self.segList = inClass
        QDialog.accept(self)
Exemplo n.º 8
0
    def accept(self):
        """Validate the column name, then record the tier and segment choices."""
        name = self.nameWidget.text()
        self.attribute = Attribute(name.lower().replace(' ', ''), 'numeric',
                                   name)
        if name == '':
            QMessageBox.critical(self, "Missing information",
                                 "Please enter a name for the tier.")
            return
        if self.attribute.name in self.corpus.basic_attributes:
            QMessageBox.critical(
                self, "Invalid information",
                "The name '{}' overlaps with a protected column.".format(name))
            return
        if self.attribute in self.corpus.attributes:
            # Ask before clobbering an existing tier of the same name.
            confirm = QMessageBox(
                QMessageBox.Warning, "Duplicate tiers",
                "'{}' is already the name of a tier.  Overwrite?".format(name),
                QMessageBox.NoButton, self)
            confirm.addButton("Overwrite", QMessageBox.AcceptRole)
            confirm.addButton("Cancel", QMessageBox.RejectRole)
            if confirm.exec_() != QMessageBox.AcceptRole:
                return

        self.sequenceType = self.tierWidget.value()
        self.segList = self.segmentSelect.value()
        QDialog.accept(self)
Exemplo n.º 9
0
    def accept(self):
        """Validate the new column's name, type and default value."""
        name = self.nameWidget.text()
        at = self.typeWidget.currentText().lower()
        dv = self.defaultWidget.text()
        self.attribute = Attribute(name.lower().replace(' ', ''), at, name)
        if name == '':
            QMessageBox.critical(self, "Missing information",
                                 "Please enter a name for the tier.")
            return
        if self.attribute.name in self.corpus.basic_attributes:
            QMessageBox.critical(
                self, "Invalid information",
                "The name '{}' overlaps with a protected column.".format(name))
            return
        if self.attribute in self.corpus.attributes:
            # Ask before clobbering an existing tier of the same name.
            confirm = QMessageBox(
                QMessageBox.Warning, "Duplicate tiers",
                "'{}' is already the name of a tier.  Overwrite?".format(name),
                QMessageBox.NoButton, self)
            confirm.addButton("Overwrite", QMessageBox.AcceptRole)
            confirm.addButton("Cancel", QMessageBox.RejectRole)
            if confirm.exec_() != QMessageBox.AcceptRole:
                return
        if at == 'numeric':
            # Numeric columns must have a numeric default.
            try:
                dv = float(dv)
            except ValueError:
                QMessageBox.critical(
                    self, "Invalid information",
                    "The default value for numeric columns must be a number")
                return
        self.attribute.default_value = dv
        QDialog.accept(self)
Exemplo n.º 10
0
    def accept(self):
        """Validate the CV-skeleton tier settings and close the dialog.

        When the CV radio button is selected, builds ``self.attribute`` and
        ``self.segList``; rejects empty or conflicting tier names and asks
        before overwriting an existing tier.
        """
        if self.cvradio.isChecked():
            tierName = 'CV skeleton'
            self.attribute = Attribute('cvskeleton', 'factor', 'CV skeleton')
            self.segList = self.generateSegList()

        # NOTE(review): if the CV radio button is NOT checked, ``tierName``
        # and ``self.attribute`` are never assigned above, so the checks
        # below would raise NameError/AttributeError -- presumably another
        # branch exists elsewhere or the radio is always checked; confirm.
        if tierName == '':
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please enter a name for the tier.")
            return
        if self.attribute.name in self.corpus.basic_attributes:
            reply = QMessageBox.critical(
                self, "Invalid information",
                "The name '{}' overlaps with a protected column.".format(
                    tierName))
            return
        elif self.attribute in self.corpus.attributes:

            # Ask before clobbering an existing tier of the same name.
            msgBox = QMessageBox(
                QMessageBox.Warning, "Duplicate tiers",
                "'{}' is already the name of a tier.  Overwrite?".format(
                    tierName), QMessageBox.NoButton, self)
            msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
            msgBox.addButton("Cancel", QMessageBox.RejectRole)
            if msgBox.exec_() != QMessageBox.AcceptRole:
                return

        QDialog.accept(self)
Exemplo n.º 11
0
def test_corpus_model(qtbot, specified_test_corpus, settings):
    """Exercise CorpusModel column management and word replacement."""
    model = CorpusModel(specified_test_corpus, settings)
    assert model.headerData(0, Qt.Horizontal, Qt.DisplayRole) == 'Spelling'
    assert model.headerData(1, Qt.Horizontal, Qt.DisplayRole) == 'Transcription'
    assert model.headerData(2, Qt.Horizontal, Qt.DisplayRole) == 'Frequency'

    # A plain column can be added and removed.
    spelling_column = Attribute('test', 'spelling', 'Test2')
    model.addColumn(spelling_column)
    assert model.headerData(3, Qt.Horizontal, Qt.DisplayRole) == 'Test2'
    model.removeAttributes(['Test2'])
    assert len(model.columns) == 3

    # Abstract tier mapping segments to classes.
    abstract_tier = Attribute('test', 'factor', 'Test')
    model.addAbstractTier(abstract_tier, {'C': ['t', 'm']})
    assert model.wordObject(0).test == 'CC'
    model.removeAttributes(['Test'])

    # Count column over the transcription.
    count_column = Attribute('test', 'numeric', 'Test')
    model.addCountColumn(count_column, 'transcription', ['t', 'm'])
    assert model.wordObject(0).test == 2
    model.removeAttributes(['Test'])

    # Ordinary tier.
    tier_column = Attribute('test', 'tier', 'Test')
    model.addTier(tier_column, ['t', 'm'])
    assert model.wordObject(0).test == ['t', 'm']
    model.removeAttributes(['Test'])

    # Replacing a word and toggling the non-lexical filter.
    word = model.wordObject(0)
    assert word.spelling == 'atema'
    replacement = Word(spelling='atema', transcription=[])
    model.replaceWord(0, replacement)
    word = model.wordObject(0)
    assert word.spelling == 'atema' and word.transcription == []
    model.hideNonLexical(True)
    assert model.wordObject(0).spelling != 'atema'
    model.hideNonLexical(False)
    assert model.wordObject(0).spelling == 'atema'
Exemplo n.º 12
0
def _scan_transcription_lines(annotation_type, f, trans_delimiters):
    """Feed each whitespace-split line of *f* into *annotation_type*,
    autodetecting the transcription delimiter from the first token that
    contains one."""
    for line in f.readlines():
        trial = line.strip().split()
        if annotation_type.trans_delimiter is None:
            for t in trial:
                for delim in trans_delimiters:
                    if delim in t:
                        annotation_type.trans_delimiter = delim
                        break
                # Fix: stop scanning once a delimiter is found, matching
                # inspect_discourse_ilg; previously a later token could
                # overwrite the detected delimiter.
                if annotation_type.trans_delimiter is not None:
                    break
        annotation_type.add(trial, save=False)


def inspect_discourse_transcription(path):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a transcribed text

    Parameters
    ----------
    path : str
        Full path to text file (or a directory of .txt files)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']

    att = Attribute('transcription', 'tier', 'Transcription')
    a = AnnotationType('Transcription', None, None, attribute=att, base=True)

    if os.path.isdir(path):
        # Scan every .txt file under the directory into the same type.
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                with open(os.path.join(root, filename),
                          encoding='utf-8-sig',
                          mode='r') as f:
                    _scan_transcription_lines(a, f, trans_delimiters)
    else:
        with open(path, encoding='utf-8-sig', mode='r') as f:
            _scan_transcription_lines(a, f, trans_delimiters)
    annotation_types = [a]
    return annotation_types
Exemplo n.º 13
0
def test_ilg_basic(ilg_test_dir):
    """The basic ILG fixture loads and 'a' occurs twice in the lexicon."""
    path = os.path.join(ilg_test_dir, 'test_basic.txt')
    tier_attribute = Attribute('transcription', 'tier')
    tier_attribute.delimiter = '.'
    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType('transcription', None, None,
                                        token=False, base=True,
                                        attribute=tier_attribute)
    transcription_type.trans_delimiter = '.'
    corpus = load_discourse_ilg('test', path,
                                [spelling_type, transcription_type])
    print(corpus.words)
    print(corpus.lexicon.words)
    assert corpus.lexicon.find('a').frequency == 2
def test_ilg_data(ilg_test_dir):
    """ilg_to_data yields the expected words and segment list."""
    path = os.path.join(ilg_test_dir, 'test_basic.txt')
    tier_attribute = Attribute('transcription', 'tier')
    tier_attribute.delimiter = '.'
    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType('transcription', None, None,
                                        token=False, base=True,
                                        attribute=tier_attribute)
    transcription_type.trans_delimiter = '.'
    data = ilg_to_data(path, [spelling_type, transcription_type])

    # Three words, each spanning two transcription segments.
    expected_words = []
    for label, begin, end in (('a', 0, 2), ('a', 2, 4), ('b', 4, 6)):
        word = Annotation(label)
        word.references.append('transcription')
        word.begins.append(begin)
        word.ends.append(end)
        expected_words.append(word)

    assert data['spelling']._list == expected_words
    assert data['transcription']._list == [BaseAnnotation(segment)
                                           for segment in 'ababcd']
Exemplo n.º 15
0
    def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None,
                 is_default=False):
        """Set up an annotation type and derive its output name and attribute.

        Parameters
        ----------
        name : str
            Name of this annotation type; NOTE this represents something
            like "Orthography" or "Transcription", not the user-facing
            column name (e.g. "canonical_pron") -- for that, see
            ``output_name`` or ``attribute``.
        subtype : str or None
            Name of the annotation type one level below this one
        supertype : str or None
            Name of the annotation type one level above this one
        attribute : Attribute, optional
            Attribute to associate with this type; when None, one is
            generated from ``name`` ('tier' if ``base``, else 'spelling')
        anchor : bool
            Whether this is the anchor (spelling) level
        token : bool
            Whether annotations of this type are tokens
        base : bool
            Whether this is a base (transcription/tier) level
        speaker : str, optional
            Speaker name; stripped from ``name`` to form ``output_name``
        is_default : bool
            Passed through to the autogenerated Attribute
        """
        self.characters = set()
        self.ignored_characters = set()
        self.digraphs = set()
        self.trans_delimiter = None
        self.syllable_delimiter = None
        self.morph_delimiters = set()
        self.number_behavior = None

        self.stress_specification = dict()
        self.tone_specification = dict()

        # Contains Annotations for spelling and BaseAnnotations for
        # transcriptions.
        self._list = []
        self.name = name
        self.subtype = subtype
        self.supertype = supertype
        self.token = token
        self.base = base #base is transcription/tier type
        self.anchor = anchor #anchor is spelling type
        self.speaker = speaker
        self.ignored = False
        self.is_default = is_default
        if self.speaker is not None:
            # Fix: raw pattern so ``\W`` is a proper regex class, and
            # re.escape() in case the speaker name contains metacharacters.
            self.output_name = re.sub(r'{}\W*'.format(re.escape(self.speaker)), '', self.name)
        else:
            self.output_name = self.name
        if attribute is None:
            if base:
                self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name, is_default=is_default)
            else:
                self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name, is_default=is_default)
        else:
            self.attribute = attribute
Exemplo n.º 16
0
def test_ilg_mismatched(ilg_test_dir):
    """Mismatched word/transcription counts raise ILGWordMismatchError."""
    mismatched_path = os.path.join(ilg_test_dir, 'test_mismatched.txt')

    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType(
        'transcription', None, None, token=False, base=True,
        attribute=Attribute('transcription', 'tier'))
    transcription_type.trans_delimiter = '.'

    with pytest.raises(ILGWordMismatchError):
        load_discourse_ilg('test', mismatched_path,
                           [spelling_type, transcription_type])
Exemplo n.º 17
0
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file (or a directory of .txt files)
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
        # Fix: honor an explicitly supplied ``number`` instead of always
        # overwriting it with the auto-detected value (the docstring
        # promises auto-detection only when it is not supplied).
        if number is None:
            numbers = {f: calculate_lines_per_gloss(ls)
                       for f, ls in lines.items()}
            number = most_frequent_value(numbers)
        # NOTE(review): ``p`` is left as the last file walked and is used
        # as the template file below; if the directory holds no .txt files
        # this raises NameError, as in the original -- confirm intent.
    else:
        lines[path] = text_to_lines(path)
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        if i == 0:
            # First line of each gloss is the spelling (anchor) line.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name,
                               None,
                               None,
                               anchor=True,
                               token=False,
                               attribute=att)
        else:
            labels = lines[p][i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name,
                               None,
                               annotation_types[0].name,
                               token=False,
                               attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first delimiter found in any label.
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(lines[p][i][1], save=False)
        annotation_types.append(a)
    # Fold the remaining files' labels into the detected types.
    for k, v in lines.items():
        if k == p:
            continue
        for i in range(number):
            labels = lines[k][i][1]
            annotation_types[i].add(labels, save=False)

    return annotation_types
Exemplo n.º 18
0
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, support_corpus=None, ignore_case=False,
                       call_back=None, stop_check=None):
    """Assemble a Discourse (plus its lexicon) from parsed AnnotationTypes.

    Parameters
    ----------
    corpus_name : str, optional
        Name for the resulting discourse.
    wav_path : str, optional
        Path to an associated sound file.
    annotation_types : list of AnnotationType, optional
        Parsed levels; each type's ``_list`` must hold either Annotations
        (spellings) or BaseAnnotations (transcription segments), never a mix.
    support_corpus : Corpus, optional
        Corpus used to look up transcriptions for each spelling.
    ignore_case : bool
        Whether support-corpus lookups ignore case.
    call_back : callable, optional
        Progress callback.
    stop_check : callable, optional
        Returns True when the user cancelled; the function then returns None.

    Returns
    -------
    Discourse or None
        The assembled discourse, or None if cancelled via ``stop_check``.

    Raises
    ------
    TypeError
        If an AnnotationType mixes Annotations and BaseAnnotations.
    """
    curr_word = list()
    # One (label/transcription, begin, end) list per annotation type, keyed
    # by the AnnotationType object itself.
    annotations = {at:list() for at in annotation_types}
    # NOTE(review): these two are never used below -- leftover from an
    # earlier revision?
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0

    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))

        elif all(type(item) == BaseAnnotation for item in at._list):
            #it's a list of transcriptions, with each segment as a BaseAnnotation
            # Segments are grouped into words: a segment with an ``end``
            # closes the current word.
            # NOTE(review): ``begin``/``end`` are unbound if the first
            # segment carries no begin/end time -- presumably timed input
            # always sets them; confirm.
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")

    if support_corpus is not None:
        # Look up each spelling's transcription in the support corpus,
        # retrying without punctuation, falling back to one symbol per
        # character of the spelling.
        spellings = [value for key,value in annotations.items() if key.name=='Orthography (default)'][0]
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0] #info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                try:
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index+1))


    # Map annotation types onto the Discourse constructor's keyword
    # arguments; anything that is not the default spelling/transcription
    # becomes an extra attribute.
    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)

    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')

    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)

    discourse = Discourse(discourse_kwargs)

    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False

    # Walk the annotation lists in parallel, building one Word/WordToken
    # per position.
    ind = 0
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)

        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                    #annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            # Reuse an existing lexicon entry when the spelling is known.
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)

        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            #word_token_kwargs[at.output_name] = (at.attribute, annotations[at][n][0])
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    # Default tier supplies the token's time span; fall back
                    # to the running index when times are absent.
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
        word_token_kwargs['begin'] = begin if begin is not None else ind
        word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
Exemplo n.º 19
0
def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of lines to parse from the file
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : list, optional
        A prespecfied set of transcription delimiters to look for, will
        autodetect if not supplied

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    common_delimiters = [coldelim] if coldelim is not None else [',', '\t', ':', '|']
    trans_delimiters = [transdelim] if transdelim is not None else ['.', ' ', ';', ',']

    with open(path, 'r', encoding='utf-8') as f:
        head = f.readline().strip()
        lines = [line.strip() for line in f.readlines()]

    # Pick the candidate delimiter that splits the header into the most
    # columns; ties go to the earlier candidate.
    best = ''
    num = 1
    for candidate in common_delimiters:
        column_count = len(head.split(candidate))
        if column_count > num:
            num = column_count
            best = candidate
    if best == '':
        raise(DelimiterError('The column delimiter specified did not create multiple columns.'))

    head = head.split(best)
    vals = {h: list() for h in head}

    for line in lines:
        cells = line.strip().split(best)
        if len(cells) != len(head):
            raise(PCTError('{}, {}'.format(cells, head)))
        for column, cell in zip(head, cells):
            vals[column].append(cell)

    atts = list()
    for h in head:
        cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        if cat == 'tier':
            # Use the first transcription delimiter present in the first cell.
            for t in trans_delimiters:
                if t in vals[h][0]:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save=False)
        atts.append(a)

    return atts, best
Exemplo n.º 20
0
def spelling_annotation_type():
    """Return a bare AnnotationType whose attribute is a spelling column."""
    annotation_type = AnnotationType('test', None, None)
    annotation_type.attribute = Attribute('test', 'spelling')
    return annotation_type
Exemplo n.º 21
0
def transcription_annotation_type():
    """Return a bare AnnotationType configured as a '.'-delimited tier."""
    annotation_type = AnnotationType('test', None, None)
    annotation_type.trans_delimiter = '.'
    annotation_type.attribute = Attribute('test', 'tier')
    return annotation_type
Exemplo n.º 22
0
def numeric_annotation_type():
    """Build a minimal numeric AnnotationType named 'test' (fixture helper)."""
    annotation = AnnotationType('test', None, None)
    annotation.attribute = Attribute('test', 'numeric')
    return annotation
Exemplo n.º 23
0
def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or to a directory that is searched
        recursively for ``.TextGrid`` files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file(s)
    """
    trans_delimiters = ['.', ' ', ';', ',']

    # Collect every .TextGrid under a directory, or just the single file.
    if os.path.isdir(path):
        textgrids = [os.path.join(root, fn)
                     for root, subdirs, files in os.walk(path)
                     for fn in files
                     if fn.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for grid_path in textgrids:
        tg = load_textgrid(grid_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if segments else None
        anchor = spellings[0] if spellings else None
        interval_tiers = [tier for tier in tg.tiers
                          if isinstance(tier, IntervalTier)]
        if not anno_types:
            # The first grid determines the annotation structure.
            for tier in interval_tiers:
                if tier.name in spellings:
                    a = AnnotationType(tier.name, base, None,
                                       anchor=True, token=False)
                elif tier.name in segments:
                    a = AnnotationType(tier.name, None, anchor,
                                       base=True, token=True)
                else:
                    labels = uniqueLabels(tier)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(tier.name),
                                    cat, tier.name)
                    a = AnnotationType(tier.name, None, anchor,
                                       token=False, attribute=att)
                    if cat == 'tier':
                        # Use the first delimiter that occurs in any label.
                        for label in labels:
                            hit = next((d for d in trans_delimiters
                                        if d in label), None)
                            if hit is not None:
                                a.trans_delimiter = hit
                                break
                a.add((interval.mark for interval in tier), save=False)
                anno_types.append(a)
        else:
            # Subsequent grids must mirror the first grid's tier layout.
            if len(anno_types) != len(interval_tiers):
                raise (PCTError(
                    "The TextGrids must have the same number of tiers."))
            for i, tier in enumerate(interval_tiers):
                anno_types[i].add((interval.mark for interval in tier),
                                  save=False)

    return anno_types
Exemplo n.º 24
0
    def generateKwargs(self):
        """
        Collect neighborhood-density settings from the dialog widgets into a
        kwargs dict for the analysis worker.

        Returns
        -------
        dict or None
            Keyword arguments for the analysis, or ``None`` when required
            input is missing/invalid (a critical message box is shown and the
            dialog stays open).
        """
        # The quadratic algorithm is only valid when max distance is 1.
        if self.useQuadratic.isChecked() and int(
                self.maxDistanceEdit.text()) != 1:
            self.useQuadratic.setChecked(False)

        if self.maxDistanceEdit.text() == '':
            max_distance = None
        else:
            max_distance = float(self.maxDistanceEdit.text())
        ##------------------
        try:
            frequency_cutoff = float(self.minFreqEdit.text())
        except ValueError:
            frequency_cutoff = 0.0
        ##-------------------
        alg = self.algorithmWidget.value()
        typeToken = self.typeTokenWidget.value()

        if self.fileRadio.isChecked():  #using list of words not in corpus
            # Align the tier selection with the file's declared content type.
            file_type = self.fileOptions.currentText().split(' ')[-1].strip()
            for tiername in [
                    self.tierWidget.tierSelect.itemText(i)
                    for i in range(self.tierWidget.tierSelect.count())
            ]:
                if tiername == file_type:
                    self.tierWidget.tierSelect.setCurrentText(tiername)
                    break

        kwargs = {'corpusModel':self.corpusModel,
                'algorithm': alg,
                'context': self.variantsWidget.value(),
                'sequence_type':self.tierWidget.value(),#this is just a string
                'tier_type': self.tierWidget.attValue(),#this is an Attribute type object
                'type_token':typeToken,
                'max_distance':max_distance,
                'frequency_cutoff':frequency_cutoff,
                'num_cores':self.settings['num_cores'],
                'force_quadratic': self.useQuadratic.isChecked(),
                'file_type': self.fileOptions.currentText().split()[-1],
                'collapse_homophones': self.collapseHomophones.isChecked(),
                'output_format': self.saveFileFormat.currentText().split(' ')[-1].lower(),
                'in_corpus': True}

        out_file = self.saveFileWidget.value()
        if out_file == '':
            out_file = None
        else:
            kwargs['output_filename'] = out_file

        kwargs['file_list'] = None
        if self.compType is None:
            reply = QMessageBox.critical(
                self, "Missing information",
                'Please select an option from the "Query" section in the middle of the window.'
            )
            return
        elif self.compType == 'one':
            text = self.oneWordEdit.text()
            if not text:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please specify a word.")
                return
            try:
                w = self.corpusModel.corpus.find(text)
            except KeyError:
                # BUG FIX: message previously read "does match" (missing "not").
                reply = QMessageBox.critical(
                    self, "Invalid information",
                    "The spelling specified does not match any words in the corpus."
                )
                return
            kwargs['query'] = [w]
            kwargs['output_filename'] = out_file
        elif self.compType == 'nonword':
            if self.oneNonword is None:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please create a word/nonword.")
                return
            if not getattr(self.oneNonword, kwargs['sequence_type']):
                reply = QMessageBox.critical(
                    self, "Missing information",
                    "Please recreate the word/nonword with '{}' specified.".
                    format(self.tierWidget.displayValue()))
                return
            kwargs['query'] = [self.oneNonword]
            kwargs['in_corpus'] = False
            kwargs['output_filename'] = out_file
        elif self.compType == 'file':
            path = self.fileWidget.value()
            kwargs['file_list'] = path
            kwargs['in_corpus'] = False
            if not path:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please enter a file path.")
                return
            if not os.path.exists(path):
                reply = QMessageBox.critical(
                    self, "Invalid information",
                    "The file path entered was not found.")
                return
            kwargs['query'] = list()
            file_sequence_type = self.fileOptions.currentText().split(
                ' ')[-1].lower()
            # Entries from the file are used as-is (not looked up in corpus).
            text = load_words_neighden(path, file_sequence_type)
            for t in text:
                kwargs['query'].append(t)
        elif self.compType == 'all':
            column = self.columnEdit.text()
            if column == '':
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please enter a column name.")
                return
            colName = column.replace(' ', '_')
            attribute = Attribute(colName, 'numeric', column)
            if column in self.corpusModel.columns:

                msgBox = QMessageBox(
                    QMessageBox.Warning, "Duplicate columns",
                    "'{}' is already the name of a column.  Overwrite?".format(
                        column), QMessageBox.NoButton, self)
                msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
                msgBox.addButton("Cancel", QMessageBox.RejectRole)
                if msgBox.exec_() != QMessageBox.AcceptRole:
                    return
            kwargs['attribute'] = attribute
        return kwargs
Exemplo n.º 25
0
def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or a directory searched recursively
        for ``.TextGrid`` files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file(s)
    """
    trans_delimiters = ['.', ' ', ';', ',']

    # Gather the TextGrid paths to inspect.
    if os.path.isdir(path):
        textgrids = [os.path.join(root, fn)
                     for root, subdirs, files in os.walk(path)
                     for fn in files
                     if fn.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for grid_path in textgrids:
        tg = load_textgrid(grid_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if segments else None
        anchor = spellings[0] if spellings else None
        interval_tiers = [tier for tier in tg.tiers
                          if isinstance(tier, IntervalTier)]
        if not anno_types:
            # The first grid defines the annotation structure.
            for tier in interval_tiers:
                if tier.name in spellings:
                    a = AnnotationType(tier.name, base, None,
                                       anchor=True, token=False)
                elif tier.name in segments:
                    a = AnnotationType(tier.name, None, anchor,
                                       base=True, token=True)
                else:
                    labels = uniqueLabels(tier)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(tier.name),
                                    cat, tier.name)
                    a = AnnotationType(tier.name, None, anchor,
                                       token=False, attribute=att)
                    if cat == 'tier':
                        # First delimiter found in any label wins.
                        for label in labels:
                            hit = next((d for d in trans_delimiters
                                        if d in label), None)
                            if hit is not None:
                                a.trans_delimiter = hit
                                break
                a.add((interval.mark for interval in tier), save=False)
                anno_types.append(a)
        else:
            # Later grids must have the same tier layout as the first one.
            if len(anno_types) != len(interval_tiers):
                raise(PCTError("The TextGrids must have the same number of tiers."))
            for i, tier in enumerate(interval_tiers):
                anno_types[i].add((interval.mark for interval in tier),
                                  save=False)

    return anno_types
Exemplo n.º 26
0
def inspect_discourse_ilg(path, number = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to a text file, or a directory searched recursively for
        ``.txt`` files
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                numbers[p] = calculate_lines_per_gloss(lines[p])
        # BUG FIX: a caller-supplied ``number`` was previously always
        # overwritten by the auto-detected value; only detect when absent.
        if number is None:
            number = most_frequent_value(numbers)
        # NOTE(review): ``p`` ends up as the last file visited by os.walk and
        # serves as the template file below -- confirm this is intended.
    else:
        lines[path] = text_to_lines(path)
        # BUG FIX (same as above): respect an explicitly supplied ``number``.
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i+1)
        if i == 0:
            # The first line of each gloss is the orthography anchor.
            att = Attribute('spelling','spelling','Spelling')
            a = AnnotationType(name, None, None, anchor = True, token = False, attribute = att)
        else:
            labels = lines[p][i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name, token = False, attribute = att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first transcription delimiter found in any label.
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(lines[p][i][1], save = False)
        annotation_types.append(a)
    # Fold the remaining files' labels into the detected annotation types.
    for k,v in lines.items():
        if k == p:
            continue
        for i in range(number):
            labels = lines[k][i][1]
            annotation_types[i].add(labels, save = False)

    return annotation_types
Exemplo n.º 27
0
def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        Number of data rows sampled when guessing each column's type
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : list, optional
        A prespecfied set of transcription delimiters to look for, will
        autodetect if not supplied

    Returns
    -------
    tuple of (list of AnnotationTypes, str)
        Autodetected AnnotationTypes for the text file, plus the column
        delimiter that was used
    """
    candidate_delimiters = [coldelim] if coldelim is not None else [',', '\t', ':', '|']
    trans_delimiters = [transdelim] if transdelim is not None else ['.', ' ', ';', ',']

    with open(path, 'r', encoding='utf-8') as f:
        header_line = f.readline().strip()
        lines = [line.strip() for line in f.readlines()]

    # Choose the candidate that splits the header into the most columns
    # (strictly more than one); earlier candidates win ties.
    best = ''
    num = 1
    for cand in candidate_delimiters:
        n_cols = len(header_line.split(cand))
        if n_cols > num:
            num, best = n_cols, cand
    if not best:
        raise(DelimiterError('The column delimiter specified did not create multiple columns.'))

    header = header_line.split(best)
    vals = {column: [] for column in header}

    for line in lines:
        cells = line.strip().split(best)
        if len(cells) != len(header):
            raise(PCTError('{}, {}'.format(cells, header)))
        for column, cell in zip(header, cells):
            vals[column].append(cell)

    atts = []
    for column in header:
        # Guess the column category from a sample of its values.
        cat = Attribute.guess_type(vals[column][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(column), cat, column)
        a = AnnotationType(column, None, None, token=False, attribute=att)
        if cat == 'tier':
            # Pick the first transcription delimiter present in row one.
            for delim in trans_delimiters:
                if delim in vals[column][0]:
                    a.trans_delimiter = delim
                    break
        a.add(vals[column], save=False)
        atts.append(a)

    return atts, best
Exemplo n.º 28
0
def spelling_text_to_data(corpus_name, path, annotation_types = None,
                            support_corpus_path = None, ignore_case = True,
                            stop_check = None, call_back = None):
    """
    Parse a running text file as spelling-only discourse data.

    Parameters
    ----------
    corpus_name : str
        Name for the resulting discourse
    path : str
        Full path to the text file
    annotation_types : list of AnnotationType, optional
        Annotation layout; auto-detected from the file if not supplied
    support_corpus_path : str or Corpus, optional
        Corpus (or path to a pickled one) used to look up transcriptions
        for each spelled word
    ignore_case : bool, optional
        Whether support-corpus lookups ignore case
    stop_check : callable, optional
        Returns True to abort processing early (function then returns None)
    call_back : callable, optional
        Progress callback: called with a message, a (0, total) range, and
        then a running counter every 20 lines

    Returns
    -------
    DiscourseData or None
        Parsed discourse data; None if stop_check aborted the run
    """
    name = corpus_name
    if annotation_types is None:
        annotation_types = inspect_discourse_spelling(path, support_corpus_path)

    if support_corpus_path is not None:
        # Accept either an in-memory Corpus or a path to a pickled one.
        if isinstance(support_corpus_path, Corpus):
            support = support_corpus_path
        else:
            if not os.path.exists(support_corpus_path):
                raise(PCTOSError("The corpus path specified ({}) does not exist".format(support_corpus_path)))
            support = load_binary(support_corpus_path)

        # With a support corpus, add a base Transcription level to hold
        # looked-up transcriptions.
        # NOTE(review): ``is_default`` is not a parameter of the
        # AnnotationType.__init__ shown elsewhere in this file -- confirm
        # this matches the AnnotationType version actually imported here.
        a = AnnotationType('Transcription', None, None,
                           attribute=Attribute('Transcription', 'transcription', 'Transcription'),
                           base=True, is_default=True)
        annotation_types.append(a)

    # Clear any state left over from a previous inspection pass.
    for a in annotation_types:
        a.reset()

    data = DiscourseData(name, annotation_types)

    lines = text_to_lines(path)
    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
        cur = 0

    for line in lines:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 20 == 0:
                call_back(cur)
        if not line or line == '\n':
            continue
        annotations = {}
        for word in line:
            # Strip whitespace and any characters the Spelling type ignores.
            spell = word.strip()
            spell = ''.join(x for x in spell if not x in data['Spelling'].ignored_characters)
            if spell == '':
                continue
            word = Annotation(spell)
            if support_corpus_path is not None:
                # Look up the transcription; words missing from the support
                # corpus get an empty transcription rather than an error.
                trans = None
                try:
                    trans = support.find(spell, ignore_case = ignore_case).transcription
                except KeyError:
                    trans = []
                n = data.base_levels[0]
                tier_elements = [BaseAnnotation(x) for x in trans]
                # Record this word's span on the base transcription level
                # as [current length, current length + len(transcription)).
                level_count = data.level_length(n)
                word.references.append(n)
                word.begins.append(level_count)
                word.ends.append(level_count + len(tier_elements))
                annotations[n] = tier_elements
            annotations['Spelling'] = [word]
            data.add_annotations(**annotations)

    return data
Exemplo n.º 29
0
    def generateKwargs(self):
        """
        Collect phonotactic-probability settings from the dialog widgets into
        a kwargs dict for the analysis worker.

        Returns
        -------
        dict or None
            Keyword arguments for the analysis, or ``None`` when required
            input is missing/invalid (a critical message box is shown and
            the dialog stays open).
        """
        ##------------------
        try:
            frequency_cutoff = float(self.minFreqEdit.text())
        except ValueError:
            frequency_cutoff = 0.0
        ##-------------------

        kwargs = {
            'corpusModel': self.corpusModel,
            'algorithm': self.algorithmWidget.value(),
            'context': self.variantsWidget.value(),
            'sequence_type': self.tierWidget.value(),
            'type_token': self.typeTokenWidget.value(),
            'frequency_cutoff': frequency_cutoff,
            'probability_type': self.probabilityTypeWidget.value()
        }

        if self.compType is None:
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please specify a comparison type.")
            return
        elif self.compType == 'one':
            text = self.oneWordEdit.text()
            if not text:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please specify a word.")
                return
            try:
                w = self.corpusModel.corpus.find(text)
            except KeyError:
                # BUG FIX: message previously read "does match" (missing "not").
                reply = QMessageBox.critical(
                    self, "Invalid information",
                    "The spelling specified does not match any words in the corpus."
                )
                return
            kwargs['query'] = [w]
        elif self.compType == 'nonword':
            if self.oneNonword is None:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please create a word/nonword.")
                return
            if not getattr(self.oneNonword, kwargs['sequence_type']):
                reply = QMessageBox.critical(
                    self, "Missing information",
                    "Please recreate the word/nonword with '{}' specified.".
                    format(self.tierWidget.displayValue()))
                return
            kwargs['query'] = [self.oneNonword]
        elif self.compType == 'file':
            path = self.fileWidget.value()
            if not path:
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please enter a file path.")
                return
            if not os.path.exists(path):
                reply = QMessageBox.critical(
                    self, "Invalid information",
                    "The file path entered was not found.")
                return
            kwargs['query'] = list()
            text = load_words_neighden(path)
            for t in text:
                if isinstance(t, str):
                    try:
                        w = self.corpusModel.corpus.find(t)
                    except KeyError:
                        reply = QMessageBox.critical(
                            self, "Invalid information",
                            "The spelling '{}' was not found in the corpus.".
                            format(t))
                        return
                    # BUG FIX: the append was previously outside the
                    # isinstance guard, so a non-str entry appended a stale
                    # (or unbound) ``w`` instead of the entry itself.
                    kwargs['query'].append(w)
                else:
                    kwargs['query'].append(t)
        elif self.compType == 'all':
            column = self.columnEdit.text()
            if column == '':
                reply = QMessageBox.critical(self, "Missing information",
                                             "Please enter a column name.")
                return
            colName = column.replace(' ', '_')
            attribute = Attribute(colName, 'numeric', column)
            if column in self.corpusModel.columns:

                msgBox = QMessageBox(
                    QMessageBox.Warning, "Duplicate columns",
                    "'{}' is already the name of a column.  Overwrite?".format(
                        column), QMessageBox.NoButton, self)
                msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
                msgBox.addButton("Cancel", QMessageBox.RejectRole)
                if msgBox.exec_() != QMessageBox.AcceptRole:
                    return
            kwargs['attribute'] = attribute

        return kwargs
Exemplo n.º 30
0
def transcription_text_to_data(corpus_name,
                               path,
                               annotation_types=None,
                               stop_check=None,
                               call_back=None):
    """
    Parse a running text file of transcribed words as discourse data.

    Parameters
    ----------
    corpus_name : str
        Name for the resulting discourse
    path : str
        Full path to the text file
    annotation_types : list of AnnotationType, optional
        Annotation layout; auto-detected from the file if not supplied
    stop_check : callable, optional
        Returns True to abort processing early (function then returns None)
    call_back : callable, optional
        Progress callback: called with a message, a (0, total) range, and
        then a running counter every 20 lines

    Returns
    -------
    DiscourseData or None
        Parsed discourse data; None if stop_check aborted the run
    """
    name = corpus_name

    if annotation_types is None:
        annotation_types = inspect_discourse_transcription(path)

    # Clear any state left over from a previous inspection pass, then add a
    # Spelling anchor level derived from the transcriptions.
    for a in annotation_types:
        a.reset()
    a = AnnotationType('Spelling',
                       None,
                       None,
                       attribute=Attribute('Spelling', 'spelling', 'Spelling'),
                       anchor=True)

    annotation_types.append(a)

    data = DiscourseData(name, annotation_types)

    lines = text_to_lines(path)
    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
        cur = 0
    trans_check = False  # vestigial; only used by the commented-out check below
    n = 'Transcription'

    for line in lines:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 20 == 0:
                call_back(cur)
        if not line or line == '\n':
            continue
        for word in line:
            annotations = dict()
            trans = parse_transcription(word, data[n])
            #if not trans_check and data[n].delimiter is not None and len(trans) > 1:
            #    trans_check = True
            # The "spelling" is reconstructed by joining the segment labels.
            spell = ''.join(x.label for x in trans)
            if spell == '':
                continue

            word = Annotation(spell)

            # Record this word's span on the Transcription level as
            # [current level length, current length + n segments), and mark
            # the first/last segments with those boundaries.
            tier_elements = trans
            level_count = data.level_length(n)
            word.references.append(n)
            word.begins.append(level_count)
            word.ends.append(level_count + len(tier_elements))
            tier_elements[0].begin = level_count
            tier_elements[-1].end = level_count + len(tier_elements)
            annotations[n] = tier_elements
            annotations['Spelling'] = [word]
            data.add_annotations(**annotations)
    #if data[n].delimiter and not trans_check:
    #    raise(DelimiterError('The transcription delimiter specified does not create multiple segments. Please specify another delimiter.'))

    return data