Example #1
 def test_tokenize_exception_mode(self):
     """Does tokenize raise exception for unknown mode?"""
     with self.assertRaises(
             ValueError,
             msg="tokenize doesn't raise exception for unknown mode!"):
         Segmenter.tokenize(
             self.entire_text_seg,
             [(re.compile(r'\W+'), 'unknown_mode')],
         )
Example #2
 def test_tokenize_exception_mode(self):
     """Does tokenize raise exception for unknown mode?"""
     with self.assertRaises(
         ValueError,
         msg="tokenize doesn't raise exception for unknown mode!"
     ):
         Segmenter.tokenize(
             self.entire_text_seg,
             [(re.compile(r'\W+'), 'unknown_mode')],
         )
Example #3
    def test_tokenize_progress(self):
        """Does tokenize track progress?"""
        def progress_callback():
            """Mock progress callback"""
            self.count += 1

        Segmenter.tokenize(
            self.word_seg,
            [(re.compile(r'\w'), 'tokenize')],
            progress_callback=progress_callback,
        )
        self.assertEqual(self.count,
                         len(self.word_seg),
                         msg="tokenize doesn't track progress!")
Example #4
 def test_tokenize_import_annotations_false_split(self):
     """Does tokenize skip importing annotations (mode split)?"""
     segmentation = Segmenter.tokenize(self.word_seg,
                                       [(re.compile(r'a'), 'split')],
                                       import_annotations=False)
     self.assertFalse(
         'a' in segmentation[0].annotations,
         msg="tokenize doesn't skip importing annotations (mode split)!")
Example #5
    def test_tokenize_progress(self):
        """Does tokenize track progress?"""

        def progress_callback():
            """Mock progress callback"""
            self.count += 1

        Segmenter.tokenize(
            self.word_seg,
            [(re.compile(r'\w'), 'tokenize')],
            progress_callback=progress_callback,
        )
        self.assertEqual(
            self.count,
            len(self.word_seg),
            msg="tokenize doesn't track progress!"
        )
Example #6
 def test_tokenize_import_annotations_tokenize(self):
     """Does tokenize import annotations (mode tokenize)?"""
     segmentation = Segmenter.tokenize(self.word_seg,
                                       [(re.compile(r'\w{2}'), 'tokenize')],
                                       import_annotations=True)
     self.assertEqual(
         segmentation[0].annotations['a'],
         '1',
         msg="tokenize doesn't import annotations (mode tokenize)!")
Example #7
 def test_tokenize_import_annotations_split(self):
     """Does tokenize import annotations (mode split)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'a'), 'split')],
     )
     self.assertEqual(
         segmentation[0].annotations['a'],
         '1',
         msg="tokenize doesn't import annotations (mode split)!")
Example #8
 def test_tokenize_autonumber(self):
     """Does tokenize autonumber input segments?"""
     segmentation = Segmenter.tokenize(self.word_seg, [
         (re.compile(r'\w+'), 'tokenize'),
         (re.compile(r'\W+'), 'split'),
     ],
                                       auto_number_as='num')
     self.assertEqual([s.annotations['num'] for s in segmentation],
                      [1, 2, 3, 4],
                      msg="tokenize doesn't autonumber input segments!")
Example #9
 def test_tokenize_import_annotations_false_split(self):
     """Does tokenize skip importing annotations (mode split)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'a'), 'split')],
         import_annotations=False
     )
     self.assertFalse(
         'a' in segmentation[0].annotations,
         msg="tokenize doesn't skip importing annotations (mode split)!"
     )
Example #10
 def test_tokenize_create_static_annotations_split(self):
     """Does tokenize create static annotations (mode split)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'\W'), 'split', {
             'c': '3'
         })],
     )
     self.assertEqual(
         [s.annotations['c'] for s in segmentation], ['3', '3'],
         msg="tokenize doesn't create static annotations (mode split)!")
Example #11
 def test_tokenize_create_static_annotations_split(self):
     """Does tokenize create static annotations (mode split)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'\W'), 'split', {'c': '3'})],
     )
     self.assertEqual(
         [s.annotations['c'] for s in segmentation],
         ['3', '3'],
         msg="tokenize doesn't create static annotations (mode split)!"
     )
Example #12
 def test_tokenize_import_annotations_split(self):
     """Does tokenize import annotations (mode split)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'a'), 'split')],
     )
     self.assertEqual(
         segmentation[0].annotations['a'],
         '1',
         msg="tokenize doesn't import annotations (mode split)!"
     )
Example #13
 def test_tokenize_sort(self):
     """Does tokenize sort output segments?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w'), 'tokenize'),
             (re.compile(r'[ae]'), 'tokenize'),
         ],
     )
     self.assertEqual([s.get_content() for s in segmentation],
                      ['a', 'a', 'b', 'c', 'd', 'e', 'e'],
                      msg="tokenize doesn't sort output segments!")
Example #14
 def test_tokenize_segment_split(self):
     """Does tokenize split input?"""
     segmentation = Segmenter.tokenize(
         self.entire_text_seg,
         [
             (re.compile(r'\W+'), 'split'),
             (re.compile(r'd'), 'split'),
         ],
     )
     self.assertEqual([s.get_content() for s in segmentation],
                      ['ab', 'ab c', 'cde', 'e'],
                      msg="tokenize doesn't split input!")
Example #15
 def test_tokenize_import_annotations_tokenize(self):
     """Does tokenize import annotations (mode tokenize)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'\w{2}'), 'tokenize')],
         import_annotations=True
     )
     self.assertEqual(
         segmentation[0].annotations['a'],
         '1',
         msg="tokenize doesn't import annotations (mode tokenize)!"
     )
Example #16
 def test_tokenize_segment_tokenize(self):
     """Does tokenize tokenize input?"""
     segmentation = Segmenter.tokenize(
         self.entire_text_seg,
         [
             (re.compile(r'\w+'), 'tokenize'),
             (re.compile(r'\w{3,}'), 'tokenize'),
         ],
     )
     self.assertEqual([s.get_content() for s in segmentation],
                      ['ab', 'cde', 'cde'],
                      msg="tokenize doesn't tokenize input!")
Example #17
 def test_tokenize_create_dynamic_annotations_tokenize(self):
     """Does tokenize create dynamic annotations (mode tokenize)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w(\w)(\w)'), 'tokenize', {'&1': '&2'}),
         ],
     )
     self.assertEqual(
         segmentation[0].annotations['d'],
         'e',
         msg="tokenize doesn't create dynamic annotations (mode tokenize)!"
     )
Example #18
 def test_tokenize_merge_duplicates(self):
     """Does tokenize merge duplicates?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w+'), 'tokenize'),
             (re.compile(r'\W+'), 'split'),
         ],
         merge_duplicates=True,
     )
     self.assertEqual([s.get_content() for s in segmentation],
                      ['ab', 'cde'],
                      msg="tokenize doesn't merge duplicates!")
Example #19
 def test_tokenize_create_dynamic_annotations_tokenize(self):
     """Does tokenize create dynamic annotations (mode tokenize)?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w(\w)(\w)'), 'tokenize', {
                 '&1': '&2'
             }),
         ],
     )
     self.assertEqual(
         segmentation[0].annotations['d'],
         'e',
         msg="tokenize doesn't create dynamic annotations (mode tokenize)!")
Example #20
 def test_tokenize_segment_split(self):
     """Does tokenize split input?"""
     segmentation = Segmenter.tokenize(
         self.entire_text_seg,
         [
             (re.compile(r'\W+'), 'split'),
             (re.compile(r'd'), 'split'),
         ],
     )
     self.assertEqual(
         [s.get_content() for s in segmentation],
         ['ab', 'ab c', 'cde', 'e'],
         msg="tokenize doesn't split input!"
     )
Example #21
 def test_tokenize_sort(self):
     """Does tokenize sort output segments?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w'), 'tokenize'),
             (re.compile(r'[ae]'), 'tokenize'),
         ],
     )
     self.assertEqual(
         [s.get_content() for s in segmentation],
         ['a', 'a', 'b', 'c', 'd', 'e', 'e'],
         msg="tokenize doesn't sort output segments!"
     )
Example #22
 def test_tokenize_segment_tokenize(self):
     """Does tokenize tokenize input?"""
     segmentation = Segmenter.tokenize(
         self.entire_text_seg,
         [
             (re.compile(r'\w+'), 'tokenize'),
             (re.compile(r'\w{3,}'), 'tokenize'),
         ],
     )
     self.assertEqual(
         [s.get_content() for s in segmentation],
         ['ab', 'cde', 'cde'],
         msg="tokenize doesn't tokenize input!"
     )
Example #23
 def test_tokenize_autonumber(self):
     """Does tokenize autonumber input segments?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w+'), 'tokenize'),
             (re.compile(r'\W+'), 'split'),
         ],
         auto_number_as='num'
     )
     self.assertEqual(
         [s.annotations['num'] for s in segmentation],
         [1, 2, 3, 4],
         msg="tokenize doesn't autonumber input segments!"
     )
Example #24
 def test_tokenize_solve_conflicts_merge_duplicates(self):
     """Does tokenize solve conflicts when merging duplicates?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w+'), 'tokenize', {'a': '10'}),
             (re.compile(r'\W+'), 'split', {'a': '20'}),
         ],
         merge_duplicates=True,
     )
     self.assertEqual(
         segmentation[1].annotations['a'],
         '20',
         msg="tokenize doesn't solve conflicts when merging duplicates!"
     )
Example #25
 def test_tokenize_merge_duplicates(self):
     """Does tokenize merge duplicates?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w+'), 'tokenize'),
             (re.compile(r'\W+'), 'split'),
         ],
         merge_duplicates=True,
     )
     self.assertEqual(
         [s.get_content() for s in segmentation],
         ['ab', 'cde'],
         msg="tokenize doesn't merge duplicates!"
     )
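The unit-test excerpts above all refer to fixtures such as self.word_seg and self.entire_text_seg whose setUp code is not shown on this page. Below is a minimal, self-contained sketch of how those fixtures could be reconstructed, inferred from the values asserted in the tests; the source text "ab cde" and the static annotation {'a': '1'} are assumptions, not code from the original test suite. The sketch also illustrates the difference between the 'tokenize' and 'split' modes and the merge_duplicates and auto_number_as options exercised above.

import re

from LTTL.Input import Input
import LTTL.Segmenter as Segmenter

# Assumed fixtures: the expected outputs in the tests (['ab', 'cde'],
# annotation 'a' == '1') are consistent with this two-word input.
entire_text_seg = Input("ab cde")
word_seg = Segmenter.tokenize(
    entire_text_seg,
    [(re.compile(r'\w+'), 'tokenize', {'a': '1'})],
)

# 'tokenize' keeps what the regex matches; 'split' keeps what lies between matches.
tokens = Segmenter.tokenize(entire_text_seg, [(re.compile(r'\w+'), 'tokenize')])
pieces = Segmenter.tokenize(entire_text_seg, [(re.compile(r'\W+'), 'split')])
print([s.get_content() for s in tokens])   # expected: ['ab', 'cde']
print([s.get_content() for s in pieces])   # expected: ['ab', 'cde']

# Applying both regexes to word_seg yields each word twice; merge_duplicates
# collapses the duplicates, and auto_number_as adds a sequential annotation
# (compare the merge_duplicates and autonumber tests above).
merged = Segmenter.tokenize(
    word_seg,
    [
        (re.compile(r'\w+'), 'tokenize'),
        (re.compile(r'\W+'), 'split'),
    ],
    merge_duplicates=True,
)
print([s.get_content() for s in merged])   # expected: ['ab', 'cde']

numbered = Segmenter.tokenize(
    word_seg,
    [
        (re.compile(r'\w+'), 'tokenize'),
        (re.compile(r'\W+'), 'split'),
    ],
    auto_number_as='num',
)
print([s.annotations['num'] for s in numbered])   # expected: [1, 2, 3, 4]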
Example #26
def main():

    input_seg = Input("un texte")

    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print("verbatim in input:", end=' ')
    contained_segment_idxs = input_seg[0].get_contained_segment_indices(
        verbatim_seg)
    try:
        print("ok" if verbatim_seg[contained_segment_idxs[0]].get_content() ==
              'un texte' else "fail")
    except:
        print("fail")

    # verbatim in verbatim = ok
    print("verbatim in verbatim:", end=' ')
    contained_segment_idxs = verbatim_seg[0].get_contained_segment_indices(
        verbatim_seg)
    try:
        print("ok" if verbatim_seg[contained_segment_idxs[0]].get_content() ==
              'un texte' else "fail")
    except:
        print("fail")

    # input in verbatim = fail
    print("input in verbatim:", end=' ')
    contained_segment_idxs = verbatim_seg[0].get_contained_segment_indices(
        input_seg)
    try:
        print("ok" if input_seg[contained_segment_idxs[0]].get_content() ==
              'un texte' else "fail")
    except:
        print("fail")

    # input in input = fail
    print("input in input:", end=' ')
    contained_segment_idxs = input_seg[0].get_contained_segment_indices(
        input_seg)
    try:
        print("ok" if input_seg[contained_segment_idxs[0]].get_content() ==
              'un texte' else "fail")
    except:
        print("fail")
Example #27
 def test_tokenize_solve_conflicts_merge_duplicates(self):
     """Does tokenize solve conflicts when merging duplicates?"""
     segmentation = Segmenter.tokenize(
         self.word_seg,
         [
             (re.compile(r'\w+'), 'tokenize', {
                 'a': '10'
             }),
             (re.compile(r'\W+'), 'split', {
                 'a': '20'
             }),
         ],
         merge_duplicates=True,
     )
     self.assertEqual(
         segmentation[1].annotations['a'],
         '20',
         msg="tokenize doesn't solve conflicts when merging duplicates!")
Example #28
def main():

    input_seg = Input("un texte")

    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print "verbatim in input:",
    contained_segments = input_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"

    # verbatim in verbatim = ok
    print "verbatim in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"

    # input in verbatim = fail
    print "input in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"

    # input in input = fail
    print "input in input:",
    contained_segments = input_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"
Example #29
def main():

    input_seg = Input("un texte")

    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print "verbatim in input:",
    contained_segments = input_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content() == 'un texte' else "fail"
    except:
        print "fail"

    # verbatim in verbatim = ok
    print "verbatim in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content() == 'un texte' else "fail"
    except:
        print "fail"

    # input in verbatim = fail
    print "input in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content() == 'un texte' else "fail"
    except:
        print "fail"

    # input in input = fail
    print "input in input:",
    contained_segments = input_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content() == 'un texte' else "fail"
    except:
        print "fail"
Example #30
from LTTL.Input import Input
import LTTL.Segmenter as Segmenter
import re

input_seg = Input("un texte")

word_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'\w+'), 'tokenize')],
)

vowel_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'[aeiouy]'), 'tokenize')],
)

for seg in word_seg[1].get_contained_segments(vowel_seg):
    print(seg.get_content())
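For reference, the output of the snippet above can be worked out by hand (this is an inference from the code, not output copied from the source): word_seg[1] is the segment 'texte', and the vowel segments it contains are its two 'e' characters, so the loop presumably prints 'e' twice. A self-contained check of that reasoning, under the same assumptions:

import re

from LTTL.Input import Input
import LTTL.Segmenter as Segmenter

input_seg = Input("un texte")
word_seg = Segmenter.tokenize(input_seg, [(re.compile(r'\w+'), 'tokenize')])
vowel_seg = Segmenter.tokenize(input_seg, [(re.compile(r'[aeiouy]'), 'tokenize')])

# word_seg[1] is 'texte'; the vowels it contains should be its two 'e's.
contained = word_seg[1].get_contained_segments(vowel_seg)
print([seg.get_content() for seg in contained])   # expected: ['e', 'e']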
Example #31
    def getTitleListFromECP(self):
        """Fetch titles from the ECP website"""

        self.infoBox.customMessage(
            "Fetching data from ECP website, please wait")

        # Attempt to connect to ECP...
        try:
            response = urllib.request.urlopen(self.base_url)
            base_html = response.read().decode('utf-8')
            self.infoBox.customMessage("Done fetching data from ECP website.")

        # If unable to connect (somehow)...
        except:

            # Set Info box and widget to "warning" state.
            self.infoBox.noDataSent(warning="Couldn't access ECP website.")

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return None

        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract table containing titles...
        genresListSeg = Segmenter.import_xml(
            segmentation=recoded_seg,
            element="ul",
            conditions={"id": re.compile(r"^genres-list")},
        )

        # Extract genre annotation...
        genreSeg = Segmenter.tokenize(
            segmentation=genresListSeg,
            regexes=[(re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)(?s)'), \
            "tokenize", {"genre": "&1"})],
            import_annotations=False,
        )

        # Extract works...
        titleSeg = Segmenter.tokenize(
            segmentation=genreSeg,
            regexes=[(re.compile(r'<li class="bibl".+?</span>(?s)'), \
            "tokenize")],
        )

        # Extract annotations...
        titleSeg = Segmenter.tokenize(
            segmentation=titleSeg,
            regexes=[
                (re.compile(r"^.*>\n(.+?)</span>.*$(?s)"), "tokenize", {
                    "author": "&1"
                }),
                (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$(?s)'),
                 "tokenize", {
                     "url": "&1"
                 }),
                (re.compile(r'^.*shtml">(.*)</a>.*$(?s)'), "tokenize", {
                    "title": "&1"
                }),
            ],
            merge_duplicates=True,
        )

        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        try:
            file = open(os.path.join(path, "cached_title_list_ecp"), "wb")
            pickle.dump(titleSeg, file, -1)
            file.close()
        except IOError:
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)

        return titleSeg
Example #32
from LTTL.Input import Input
import LTTL.Segmenter as Segmenter
import re

input_seg = Input("un texte")

word_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'\w+'), 'tokenize')],
)

consonant_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'[^aeiouy]'), 'tokenize')],
)

# Prints nothing (even though 'n' is in 'un')
for seg in word_seg[0].get_contained_segments(consonant_seg):
    print(seg.get_content())
Example #33
    import sys, re
    from PyQt4.QtGui import  QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableVariety()
    seg1 = Input(u'aabccc', 'text1')
    seg2 = Input(u'abci', 'text2')
    seg3 = Segmenter.concatenate(
        [seg1, seg2],
        import_labels_as='string',
        label='corpus'
    )
    seg4 = Segmenter.tokenize(
        seg3,
        regexes=[(re.compile(r'\w+'), u'tokenize',)],
    )
    seg5 = Segmenter.tokenize(
        seg4,
        regexes=[(re.compile(r'[ai]'), u'tokenize',)],
        label='V'
    )
    seg6 = Segmenter.tokenize(
        seg4,
        regexes=[(re.compile(r'[bc]'), u'tokenize',)],
        label='C'
    )
    seg7 = Segmenter.concatenate(
        [seg5, seg6],
        import_labels_as='category',
        label='letters',
    )
Example #34
if __name__ == '__main__':
    import sys
    import re

    from PyQt4.QtGui import QApplication
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableIntersect()
    seg1 = Input(u'hello world', 'text')
    seg2 = Segmenter.tokenize(
        seg1,
        [
            (re.compile(r'hello'), u'tokenize', {'tag': 'interj'}),
            (re.compile(r'world'), u'tokenize', {'tag': 'noun'}),
        ],
        label='words',
    )
    seg3 = Segmenter.tokenize(
        seg2,
        [(re.compile(r'[aeiou]'), u'tokenize')],
        label='V'
    )
    seg4 = Segmenter.tokenize(
        seg2,
        [(re.compile(r'[hlwrdc]'), u'tokenize')],
        label='C'
    )
    seg5 = Segmenter.tokenize(
        seg2,
Example #35
    def getTitleListFromTheatreClassique(self):
        """Fetch titles from the Theatre-classique website"""

        self.infoBox.customMessage(
            "Fetching data from Theatre-classique website, please wait"
        )
        
        # Attempt to connect to Theatre-classique...
        try:
            response = urllib.request.urlopen(self.base_url)
            base_html = response.read().decode('iso-8859-1')
            self.infoBox.customMessage(
                "Done fetching data from Theatre-classique website."
            )

        # If unable to connect (somehow)...
        except:

            # Set Info box and widget to "warning" state.
            self.infoBox.noDataSent(
                warning="Couldn't access theatre-classique website."
            )

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return None
            
        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract table containing titles from HTML.
        table_seg = Segmenter.import_xml(
            segmentation=recoded_seg,
            element="table",
            conditions={"id": re.compile(r"^table_AA$")},
        )

        # Extract table lines.
        line_seg = Segmenter.import_xml(
            segmentation=table_seg,
            element="tr",
        )

        # Compile the regex that will be used to parse each line.
        field_regex = re.compile(
            r"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
            r"<td>(.+?)</td>\s*"
            r"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
            r"<td.+?>\s*(.+?)\s*</td>\s*"
            r"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML"
        )

        # Parse each line and store the resulting segmentation in an attribute.
        titleSeg = Segmenter.tokenize(
            segmentation=line_seg,
            regexes=[
                (field_regex, "tokenize", {"author": "&1"}),
                (field_regex, "tokenize", {"title": "&2"}),
                (field_regex, "tokenize", {"year": "&3"}),
                (field_regex, "tokenize", {"genre": "&4"}),
                (field_regex, "tokenize", {"url": "&5"}),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )

        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe()))
        )
        try:
            file = open(os.path.join(path, "cached_title_list"), "wb")
            pickle.dump(titleSeg, file, -1) 
            file.close()         
        except IOError:
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)
        
        return titleSeg
Example #36
    def sendData(self):
        """(Have LTTL.Segmenter) perform the actual tokenization"""

        # Check that there's something on input...
        if not self.inputSegmentation:
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Segmented data', None, self)
            return

        # Check that there's at least one regex (if needed)...
        if ((self.displayAdvancedSettings and not self.regexes)
                or (self.segmentType == 'Use a regular expression'
                    and not (self.regex or self.displayAdvancedSettings))):
            self.infoBox.setText(u'Please enter a regex.', 'warning')
            self.send('Segmented data', None, self)
            return

        # Get regexes from basic or advanced settings...
        regexForType = {
            u'Segment into letters': r'\w',
            u'Segment into words': r'\w+',
            u'Segment into lines': r'.+',
        }
        if self.displayAdvancedSettings:
            myRegexes = self.regexes
        elif self.segmentType == 'Use a regular expression':
            myRegexes = [[
                self.regex,
                None,
                None,
                False,
                True,
                False,
                False,
                u'tokenize',
            ]]
        else:
            myRegexes = [[
                regexForType[self.segmentType],
                None,
                None,
                False,
                True,
                False,
                False,
                u'tokenize',
            ]]

        # TODO: remove message 'No label was provided.' from docs

        if self.displayAdvancedSettings:
            importAnnotations = self.importAnnotations
            if self.autoNumber:
                autoNumberKey = self.autoNumberKey
                if autoNumberKey == '':
                    self.infoBox.setText(
                        u'Please enter an annotation key for auto-numbering.',
                        'warning')
                    self.send('Segmented data', None, self)
                    return
            else:
                autoNumberKey = None
            mergeDuplicates = self.mergeDuplicates
        else:
            importAnnotations = True
            autoNumberKey = None
            mergeDuplicates = False

        # Prepare regexes...
        regexes = list()
        for regex_idx in range(len(myRegexes)):
            regex = myRegexes[regex_idx]
            regex_string = regex[0]
            if regex[3] or regex[4] or regex[5] or regex[6]:
                flags = ''
                if regex[3]:
                    flags += 'i'
                if regex[4]:
                    flags += 'u'
                if regex[5]:
                    flags += 'm'
                if regex[6]:
                    flags += 's'
                regex_string += '(?%s)' % flags
            try:
                if regex[1] and regex[2]:
                    regexes.append(
                        (re.compile(regex_string), (regex[7].lower()), {
                            regex[1]: regex[2]
                        }))
                else:
                    regexes.append(
                        (re.compile(regex_string), regex[7].lower()))
            except re.error as re_error:
                try:
                    message = u'Please enter a valid regex (error: %s' %    \
                              re_error.msg
                    if self.displayAdvancedSettings and len(myRegexes) > 1:
                        message += u', regex #%i' % (regex_idx + 1)
                    message += u').'
                except AttributeError:
                    message = u'Please enter a valid regex'
                    if self.displayAdvancedSettings and len(myRegexes) > 1:
                        message += u' (regex #%i)' % (regex_idx + 1)
                    message += u'.'
                self.infoBox.setText(message, 'error')
                self.send('Segmented data', None, self)
                return

        # Perform tokenization...
        self.controlArea.setDisabled(True)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        progressBar = ProgressBar(self,
                                  iterations=len(self.inputSegmentation) *
                                  len(myRegexes))
        self.warning()
        self.error()
        try:
            segmented_data = Segmenter.tokenize(
                segmentation=self.inputSegmentation,
                regexes=regexes,
                label=self.captionTitle,
                import_annotations=importAnnotations,
                merge_duplicates=mergeDuplicates,
                auto_number_as=autoNumberKey,
                progress_callback=progressBar.advance,
            )
            message = u'%i segment@p sent to output.' % len(segmented_data)
            message = pluralize(message, len(segmented_data))
            self.infoBox.setText(message)
            self.send('Segmented data', segmented_data, self)
        except IndexError:
            self.infoBox.setText(
                u'Reference to unmatched group in annotation key and/or value.',
                'error')
            self.send('Segmented data', None, self)
        self.sendButton.resetSettingsChangedFlag()
        progressBar.finish()
        self.controlArea.setDisabled(False)
Example #37
    def setUp(self):
        self.maxDiff = None
        input_seg = Input("un texte")
        word_seg = Segmenter.tokenize(
            input_seg,
            [(re.compile(r'\w+'), 'tokenize')],
            import_annotations=False,
        )
        letter_seg = Segmenter.tokenize(
            input_seg,
            [
                (re.compile(r'\w'), 'tokenize', {
                    'type': 'C'
                }),
                (re.compile(r'[aeiouy]'), 'tokenize', {
                    'type': 'V'
                }),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )
        vowel_seg, consonant_seg = Segmenter.select(
            letter_seg,
            re.compile(r'V'),
            annotation_key='type',
        )

        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and without annotation (woa):
        self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 1,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 2,
            ('n', 't'): 2,
            ('n', 'e'): 1,
            ('n', 'x'): 0,
            ('t', 'u'): 1,
            ('t', 'n'): 2,
            ('t', 't'): 5,
            ('t', 'e'): 4,
            ('t', 'x'): 3,
            ('e', 'u'): 0,
            ('e', 'n'): 1,
            ('e', 't'): 4,
            ('e', 'e'): 4,
            ('e', 'x'): 3,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 3,
            ('x', 'e'): 3,
            ('x', 'x'): 3,
        }
        self.window_woa_header_row_id = '__unit__'
        self.window_woa_header_row_type = 'string'
        self.window_woa_header_col_id = '__unit__'
        self.window_woa_header_col_type = 'string'
        self.window_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.window_woa_col_ids
        }
        self.window_woa_ref = IntPivotCrosstab(
            self.window_woa_row_ids,
            self.window_woa_col_ids,
            self.window_woa_values,
            self.window_woa_header_row_id,
            self.window_woa_header_row_type,
            self.window_woa_header_col_id,
            self.window_woa_header_col_type,
            self.window_woa_col_type,
        )
        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and with annotation (wa):
        self.window_wa_row_ids = ['C', 'V']
        self.window_wa_col_ids = ['C', 'V']
        self.window_wa_values = {
            ('C', 'C'): 5,
            ('C', 'V'): 5,
            ('V', 'C'): 5,
            ('V', 'V'): 5,
        }
        self.window_wa_header_row_id = '__unit__'
        self.window_wa_header_row_type = 'string'
        self.window_wa_header_col_id = '__unit__'
        self.window_wa_header_col_type = 'string'
        self.window_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.window_wa_col_ids
        }
        self.window_wa_ref = IntPivotCrosstab(
            self.window_wa_row_ids,
            self.window_wa_col_ids,
            self.window_wa_values,
            self.window_wa_header_row_id,
            self.window_wa_header_row_type,
            self.window_wa_header_col_id,
            self.window_wa_header_col_type,
            self.window_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and without annotation (woa):
        self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 0,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 1,
            ('n', 't'): 0,
            ('n', 'e'): 0,
            ('n', 'x'): 0,
            ('t', 'u'): 0,
            ('t', 'n'): 0,
            ('t', 't'): 1,
            ('t', 'e'): 1,
            ('t', 'x'): 1,
            ('e', 'u'): 0,
            ('e', 'n'): 0,
            ('e', 't'): 1,
            ('e', 'e'): 1,
            ('e', 'x'): 1,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 1,
            ('x', 'e'): 1,
            ('x', 'x'): 1,
        }
        self.context_wos_woa_header_row_id = '__context__'
        self.context_wos_woa_header_row_type = 'string'
        self.context_wos_woa_header_col_id = '__context__'
        self.context_wos_woa_header_col_type = 'string'
        self.context_wos_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_wos_woa_col_ids
        }
        self.context_wos_woa_ref = IntPivotCrosstab(
            self.context_wos_woa_row_ids,
            self.context_wos_woa_col_ids,
            self.context_wos_woa_values,
            self.context_wos_woa_header_row_id,
            self.context_wos_woa_header_row_type,
            self.context_wos_woa_header_col_id,
            self.context_wos_woa_header_col_type,
            self.context_wos_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and with annotation (wa):
        self.context_wos_wa_row_ids = ['V', 'C']
        self.context_wos_wa_col_ids = ['V', 'C']
        self.context_wos_wa_values = {
            ('V', 'V'): 2,
            ('V', 'C'): 2,
            ('C', 'V'): 2,
            ('C', 'C'): 2,
        }
        self.context_wos_wa_header_row_id = '__context__'
        self.context_wos_wa_header_row_type = 'string'
        self.context_wos_wa_header_col_id = '__context__'
        self.context_wos_wa_header_col_type = 'string'
        self.context_wos_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_wos_wa_col_ids
        }
        self.context_wos_wa_ref = IntPivotCrosstab(
            self.context_wos_wa_row_ids,
            self.context_wos_wa_col_ids,
            self.context_wos_wa_values,
            self.context_wos_wa_header_row_id,
            self.context_wos_wa_header_row_type,
            self.context_wos_wa_header_col_id,
            self.context_wos_wa_header_col_type,
            self.context_wos_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and without annotation (woa):
        self.context_ws_woa_col_ids = ['u', 'e']
        self.context_ws_woa_row_ids = ['n', 't', 'x']
        self.context_ws_woa_values = {
            ('n', 'u'): 1,
            ('n', 'e'): 0,
            ('t', 'u'): 0,
            ('t', 'e'): 1,
            ('x', 'u'): 0,
            ('x', 'e'): 1,
        }
        self.context_ws_woa_header_row_id = '__context__'
        self.context_ws_woa_header_row_type = 'string'
        self.context_ws_woa_header_col_id = '__context__'
        self.context_ws_woa_header_col_type = 'string'
        self.context_ws_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_ws_woa_col_ids
        }
        self.context_ws_woa_ref = IntPivotCrosstab(
            self.context_ws_woa_row_ids,
            self.context_ws_woa_col_ids,
            self.context_ws_woa_values,
            self.context_ws_woa_header_row_id,
            self.context_ws_woa_header_row_type,
            self.context_ws_woa_header_col_id,
            self.context_ws_woa_header_col_type,
            self.context_ws_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and with annotation (wa):
        self.context_ws_wa_row_ids = ['C']
        self.context_ws_wa_col_ids = ['V']
        self.context_ws_wa_values = {
            ('C', 'V'): 2,
        }
        self.context_ws_wa_header_row_id = '__context__'
        self.context_ws_wa_header_row_type = 'string'
        self.context_ws_wa_header_col_id = '__context__'
        self.context_ws_wa_header_col_type = 'string'
        self.context_ws_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_ws_wa_col_ids
        }
        self.context_ws_wa_ref = IntPivotCrosstab(
            self.context_ws_wa_row_ids,
            self.context_ws_wa_col_ids,
            self.context_ws_wa_values,
            self.context_ws_wa_header_row_id,
            self.context_ws_wa_header_row_type,
            self.context_ws_wa_header_col_id,
            self.context_ws_wa_header_col_type,
            self.context_ws_wa_col_type,
        )
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units={'segmentation': letter_seg},
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units={
                'segmentation': letter_seg,
                'annotation_key': 'type'
            },
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units={'segmentation': letter_seg},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units={
                'segmentation': letter_seg,
                'annotation_key': 'type'
            },
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg},
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units={
                'segmentation': vowel_seg,
                'annotation_key': 'type'
            },
            contexts={'segmentation': word_seg},
            units2={
                'segmentation': consonant_seg,
                'annotation_key': 'type'
            },
        )
Example #38
    def getTitleListFromTheatreClassique(self):
        """Fetch titles from the Theatre-classique website"""

        self.infoBox.customMessage(
            u'Fetching data from Theatre-classique website, please wait')

        # Attempt to connect to Theatre-classique...
        try:
            response = urllib2.urlopen(self.base_url)
            base_html = unicode(response.read(), 'iso-8859-1')
            self.infoBox.customMessage(
                u'Done fetching data from Theatre-classique website.')

        # If unable to connect (somehow)...
        except:

            # Set Info box and widget to 'warning' state.
            self.infoBox.noDataSent(
                warning=u"Couldn't access theatre-classique website.")

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send(u'Text data', None, self)
            return None

        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract table containing titles from HTML.
        table_seg = Segmenter.import_xml(
            segmentation=recoded_seg,
            element=u'table',
            conditions={u'id': re.compile(ur'^table_AA$')},
        )

        # Extract table lines.
        line_seg = Segmenter.import_xml(
            segmentation=table_seg,
            element=u'tr',
        )

        # Compile the regex that will be used to parse each line.
        field_regex = re.compile(
            ur"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
            ur"<td>(.+?)</td>\s*"
            ur"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
            ur"<td.+?>\s*(.+?)\s*</td>\s*"
            ur"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML")

        # Parse each line and store the resulting segmentation in an attribute.
        titleSeg = Segmenter.tokenize(
            segmentation=line_seg,
            regexes=[
                (field_regex, u'tokenize', {
                    u'author': u'&1'
                }),
                (field_regex, u'tokenize', {
                    u'title': u'&2'
                }),
                (field_regex, u'tokenize', {
                    u'year': u'&3'
                }),
                (field_regex, u'tokenize', {
                    u'genre': u'&4'
                }),
                (field_regex, u'tokenize', {
                    u'url': u'&5'
                }),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )

        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        try:
            file = open(os.path.join(path, u"cached_title_list"), u'wb')
            pickle.dump(titleSeg, file, -1)
            file.close()
        except IOError:
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)

        return titleSeg
Example #39
    def setUp(self):
        input_seg = Input("un texte")
        word_seg = Segmenter.tokenize(
            input_seg,
            [(re.compile(r'\w+'), 'tokenize')],
            import_annotations=False,
        )
        letter_seg = Segmenter.tokenize(
            input_seg,
            [
                (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
                (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )
        vowel_seg, consonant_seg = Segmenter.select(
            letter_seg,
            re.compile(r'V'),
            annotation_key='type',
        )

        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and without annotation (woa):
        self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 1,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 2,
            ('n', 't'): 2,
            ('n', 'e'): 1,
            ('n', 'x'): 0,
            ('t', 'u'): 1,
            ('t', 'n'): 2,
            ('t', 't'): 5,
            ('t', 'e'): 4,
            ('t', 'x'): 3,
            ('e', 'u'): 0,
            ('e', 'n'): 1,
            ('e', 't'): 4,
            ('e', 'e'): 4,
            ('e', 'x'): 3,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 3,
            ('x', 'e'): 3,
            ('x', 'x'): 3,
        }
        self.window_woa_header_row_id = '__unit__'
        self.window_woa_header_row_type = 'string'
        self.window_woa_header_col_id = '__unit2__'
        self.window_woa_header_col_type = 'string'
        self.window_woa_col_type = {
            col_id: 'continuous' for col_id in self.window_woa_col_ids
            }
        self.window_woa_ref = IntPivotCrosstab(
            self.window_woa_row_ids,
            self.window_woa_col_ids,
            self.window_woa_values,
            self.window_woa_header_row_id,
            self.window_woa_header_row_type,
            self.window_woa_header_col_id,
            self.window_woa_header_col_type,
            self.window_woa_col_type,
        )
        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and with annotation (wa):
        self.window_wa_row_ids = ['C', 'V']
        self.window_wa_col_ids = ['C', 'V']
        self.window_wa_values = {
            ('C', 'C'): 5,
            ('C', 'V'): 5,
            ('V', 'C'): 5,
            ('V', 'V'): 5,
        }
        self.window_wa_header_row_id = '__unit__'
        self.window_wa_header_row_type = 'string'
        self.window_wa_header_col_id = '__unit2__'
        self.window_wa_header_col_type = 'string'
        self.window_wa_col_type = {
            col_id: 'continuous' for col_id in self.window_wa_col_ids
            }
        self.window_wa_ref = IntPivotCrosstab(
            self.window_wa_row_ids,
            self.window_wa_col_ids,
            self.window_wa_values,
            self.window_wa_header_row_id,
            self.window_wa_header_row_type,
            self.window_wa_header_col_id,
            self.window_wa_header_col_type,
            self.window_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and without annotation (woa):
        self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 0,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 1,
            ('n', 't'): 0,
            ('n', 'e'): 0,
            ('n', 'x'): 0,
            ('t', 'u'): 0,
            ('t', 'n'): 0,
            ('t', 't'): 1,
            ('t', 'e'): 1,
            ('t', 'x'): 1,
            ('e', 'u'): 0,
            ('e', 'n'): 0,
            ('e', 't'): 1,
            ('e', 'e'): 1,
            ('e', 'x'): 1,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 1,
            ('x', 'e'): 1,
            ('x', 'x'): 1,
        }
        self.context_wos_woa_header_row_id = '__unit__'
        self.context_wos_woa_header_row_type = 'string'
        self.context_wos_woa_header_col_id = '__unit2__'
        self.context_wos_woa_header_col_type = 'string'
        self.context_wos_woa_col_type = {
            col_id: 'continuous' for col_id in self.context_wos_woa_col_ids
            }
        self.context_wos_woa_ref = IntPivotCrosstab(
            self.context_wos_woa_row_ids,
            self.context_wos_woa_col_ids,
            self.context_wos_woa_values,
            self.context_wos_woa_header_row_id,
            self.context_wos_woa_header_row_type,
            self.context_wos_woa_header_col_id,
            self.context_wos_woa_header_col_type,
            self.context_wos_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and with annotation (wa):
        self.context_wos_wa_row_ids = ['V', 'C']
        self.context_wos_wa_col_ids = ['V', 'C']
        self.context_wos_wa_values = {
            ('V', 'V'): 2,
            ('V', 'C'): 2,
            ('C', 'V'): 2,
            ('C', 'C'): 2,
        }
        self.context_wos_wa_header_row_id = '__unit__'
        self.context_wos_wa_header_row_type = 'string'
        self.context_wos_wa_header_col_id = '__unit2__'
        self.context_wos_wa_header_col_type = 'string'
        self.context_wos_wa_col_type = {
            col_id: 'continuous' for col_id in self.context_wos_wa_col_ids
            }
        self.context_wos_wa_ref = IntPivotCrosstab(
            self.context_wos_wa_row_ids,
            self.context_wos_wa_col_ids,
            self.context_wos_wa_values,
            self.context_wos_wa_header_row_id,
            self.context_wos_wa_header_row_type,
            self.context_wos_wa_header_col_id,
            self.context_wos_wa_header_col_type,
            self.context_wos_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and without annotation (woa):
        self.context_ws_woa_col_ids = ['u', 'e']
        self.context_ws_woa_row_ids = ['n', 't', 'x']
        self.context_ws_woa_values = {
            ('n', 'u'): 1,
            ('n', 'e'): 0,
            ('t', 'u'): 0,
            ('t', 'e'): 1,
            ('x', 'u'): 0,
            ('x', 'e'): 1,
        }
        self.context_ws_woa_header_row_id = '__unit__'
        self.context_ws_woa_header_row_type = 'string'
        self.context_ws_woa_header_col_id = '__unit2__'
        self.context_ws_woa_header_col_type = 'string'
        self.context_ws_woa_col_type = {
            col_id: 'continuous' for col_id in self.context_ws_woa_col_ids
            }
        self.context_ws_woa_ref = IntPivotCrosstab(
            self.context_ws_woa_row_ids,
            self.context_ws_woa_col_ids,
            self.context_ws_woa_values,
            self.context_ws_woa_header_row_id,
            self.context_ws_woa_header_row_type,
            self.context_ws_woa_header_col_id,
            self.context_ws_woa_header_col_type,
            self.context_ws_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and with annotation (wa):
        self.context_ws_wa_row_ids = ['C']
        self.context_ws_wa_col_ids = ['V']
        self.context_ws_wa_values = {
            ('C', 'V'): 2,
        }
        self.context_ws_wa_header_row_id = '__unit__'
        self.context_ws_wa_header_row_type = 'string'
        self.context_ws_wa_header_col_id = '__unit2__'
        self.context_ws_wa_header_col_type = 'string'
        self.context_ws_wa_col_type = {
            col_id: 'continuous' for col_id in self.context_ws_wa_col_ids
            }
        self.context_ws_wa_ref = IntPivotCrosstab(
            self.context_ws_wa_row_ids,
            self.context_ws_wa_col_ids,
            self.context_ws_wa_values,
            self.context_ws_wa_header_row_id,
            self.context_ws_wa_header_row_type,
            self.context_ws_wa_header_col_id,
            self.context_ws_wa_header_col_type,
            self.context_ws_wa_col_type,
        )
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units={'segmentation': letter_seg},
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units={'segmentation': letter_seg, 'annotation_key': 'type'},
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units={'segmentation': letter_seg},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units={'segmentation': letter_seg, 'annotation_key': 'type'},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg},
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg, 'annotation_key': 'type'},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg, 'annotation_key': 'type'},
        )
Example #40
        self.updateGUI()
        self.sendButton.sendIf()


if __name__ == '__main__':
    import sys
    from PyQt5.QtWidgets import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableLength()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'wonderful world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    seg4 = Segmenter.tokenize(seg3, [(
        r'\w+(?u)',
        u'tokenize',
    )],
                              label=u'words')
    seg5 = Segmenter.tokenize(seg3, [(
        r'\w',
        u'tokenize',
    )], label=u'letters')
    ow.inputData(seg3, 1)
    ow.inputData(seg4, 2)
    ow.inputData(seg5, 3)
    ow.show()
    appl.exec_()
    ow.saveSettings()
Example #41
if __name__ == '__main__':
    import sys
    from PyQt5.QtWidgets import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableCooccurrence()
    seg1 = Input(u'un texte', label=u'text')
    seg2 = Segmenter.tokenize(
        seg1,
        regexes=[(
            re.compile(r'\w+'),
            u'tokenize',
            {
                'type': 'W'
            },
        )],
        label=u'words',
    )
    seg3 = Segmenter.tokenize(
        seg1,
        regexes=[(
            re.compile(r'[aeiouy]'),
            u'tokenize',
            {
                'type': 'V'
            },
        )],
        label=u'vowel',
    )
Example #42
    def getTitleListFromEighteenthCenturyPoetry(self):
        """Fetch titles from the ECP website"""

        self.infoBox.customMessage(
            "Fetching data from ECP website, please wait"
        )

        # Attempt to connect to ECP...
        try:
            response = urllib.request.urlopen(self.base_url)
            base_html = response.read().decode('iso-8859-1')
            self.infoBox.customMessage(
                "Done fetching data from EighteenthCenturyPoetry website."
            )

        # If unable to connect (somehow)...
        except:

            # Set Info box and widget to "warning" state.
            self.infoBox.noDataSent(
                warning="Couldn't access EighteenthCenturyPoetry website."
            )

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return None

        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract works.
        genre_corpus = Segmenter.import_xml(
            segmentation=recoded_seg,
            element="ul",
            conditions={"class": re.compile(r"^genres-list$")},
        )
        genre_list = Segmenter.tokenize(
            segmentation=genre_corpus,
            regexes=re.compile(r"<a.+$"),
            import_annotations=False,
            merge_duplicates=True,
        )
        work_list = Segmenter.tokenize(
            segmentation=genre_list,
            regexes=re.compile(r'<li class="bibl">(.+?)</li>'),
            import_annotations=False,
            merge_duplicates=True,
        )

        # Compile the regex that will be used to parse each line.
        field_regex = re.compile(
            r"<a href="(.+?)">"
            r"<a href=".+?">(.+?)</a>"
            r"<span style="color:.+?666">(.+?)</span>"
        )

        # Parse each line and store the resulting segmentation in an attribute.
        titleSeg = Segmenter.tokenize(
            segmentation=work_list,
            regexes=[
                (field_regex, "tokenize", {"url": "&1"}),
                (field_regex, "tokenize", {"title": "&2"}),
                (field_regex, "tokenize", {"author": "&3"}),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )


        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe()))
        )
        try:
            file = open(os.path.join(path, "cached_title_list"), "wb")
            pickle.dump(titleSeg, file, -1)
            file.close()
        except IOError:
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)

        return titleSeg
Example #43
            self.contextAnnotationKey = self.contextAnnotationKey

    def handleNewSignals(self):
        """Overridden: called after multiple signals have been added"""
        self.openContext(self.uuid, self.segmentations)
        self.updateGUI()
        self.sendButton.sendIf()


if __name__ == '__main__':
    import sys

    from PyQt4.QtGui import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableCount()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'cruel world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    seg4 = Segmenter.tokenize(seg3, [(r'\w+(?u)', u'tokenize', {
        'type': 'mot'
    })],
                              label=u'words')
    ow.inputData(seg3, 1)
    ow.inputData(seg4, 2)
    ow.show()
    appl.exec_()
    ow.saveSettings()
Example #44
        else:
            text_or_id = kwargs.get("text_or_id", None)

        if isinstance(text_or_id, str) or text_or_id is None:
            self._currentErrorMessage = text_or_id or ""
        return super().error(*args, **kwargs)

    def warning(self, *args, **kwargs):
        # Reimplemented to track the current active warning message
        if args:
            text_or_id = args[0]
        else:
            text_or_id = kwargs.get("text_or_id", None)

        if isinstance(text_or_id, str) or text_or_id is None:
            self._currentWarningMessage = text_or_id or ""
        return super().warning(*args, **kwargs)


if __name__ == '__main__':
    appl = QApplication(sys.argv)
    ow = OWTextableDisplay()
    ow.show()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'cruel world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    seg4 = Segmenter.tokenize(seg3, [(r'\w+(?u)', u'tokenize')],
                              label=u'words')
    ow.inputData(seg4)
    appl.exec_()