Example #1
def test_cv_templates(test_data):
    assert cv_templates(Wordlist(str(test_data / 'KSL5.qlc')),
                        'French',
                        output='markdown')
    patterns, _, sounds = cv_templates(Wordlist(str(test_data / 'KSL5.qlc')),
                                       'French',
                                       output=None)
Example #2
    def test_plots(self):
        plot_gls(self.gls, self.tree, filename=text_type(self.tmp_path('test')))
        plot_tree(self.tree, filename=text_type(self.tmp_path('test')))
        plot_concept_evolution(self.scenarios, self.tree,
                               filename=text_type(self.tmp_path('test')))

        wl = Wordlist(test_data('KSL.qlc'))
        wl.calculate('tree')
        plot_heatmap(wl, filename=text_type(self.tmp_path('test')),
                     ref="cogid", refB="cogid", steps=1)
Example #3
def test_colexification_network(test_data, tmppath):
    graph = colexification_network(Wordlist(str(test_data / 'colexification.tsv')))
    assert "hand" in graph and "arm" in graph

    graph = colexification_network(Wordlist(str(test_data / 'colexification.tsv')), bipartite=True)
    assert 'arm' in graph['l4.4'] and 'hand' in graph['l4.4']

    _ = colexification_network(
        Wordlist(str(test_data / 'colexification.tsv')),
        output="gml",
        filename=str(tmppath / "test"))
Example #4
    def test_cache(self):
        filename = 'lingpy_test.qlc'
        self.parser.pickle(filename=filename)
        from_cache = QLCParser.unpickle(filename)
        self.assertEqual(self.parser.header, from_cache.header)
        os.remove(str(path(filename)))

        wl = Wordlist(test_data('KSL.qlc'))
        wl.pickle(filename=filename)
        from_cache = Wordlist.unpickle(filename)
        self.assert_(from_cache._class)
        os.remove(str(path(filename)))
Example #5
def test_plots(mocker, Plt, Sch, gls, tree, scenarios, tmppath, test_data):
    mocker.patch('lingpy.convert.plot.mpl', new=mocker.MagicMock())
    mocker.patch('lingpy.convert.plot.plt', new=Plt)
    mocker.patch('lingpy.convert.plot.sch', new=Sch)

    plot_gls(gls, tree, filename=str(tmppath / 'test'))
    plot_tree(tree, filename=str(tmppath / 'test'))
    plot_concept_evolution(scenarios, tree, filename=str(tmppath / 'test'))

    wl = Wordlist(str(test_data / 'KSL.qlc'))
    wl.calculate('tree')
    plot_heatmap(wl, filename=str(tmppath / 'test'), ref="cogid", refB="cogid", steps=1)
Example #6
    def test_colexification_network(self):
        graph = colexification_network(
            Wordlist(test_data('colexification.tsv')))
        assert "hand" in graph and "arm" in graph

        graph = colexification_network(Wordlist(
            test_data('colexification.tsv')),
                                       bipartite=True)
        assert 'arm' in graph['l4.4'] and 'hand' in graph['l4.4']

        _ = colexification_network(Wordlist(test_data('colexification.tsv')),
                                   output="gml",
                                   filename=text_type(self.tmp_path("test")))
Example #7
    def test_plots(self):
        plot_gls(self.gls,
                 self.tree,
                 filename=text_type(self.tmp_path('test')))
        plot_tree(self.tree, filename=text_type(self.tmp_path('test')))
        plot_concept_evolution(self.scenarios,
                               self.tree,
                               filename=text_type(self.tmp_path('test')))

        wl = Wordlist(test_data('KSL.qlc'))
        wl.calculate('tree')
        plot_heatmap(wl,
                     filename=text_type(self.tmp_path('test')),
                     ref="cogid",
                     refB="cogid",
                     steps=1)
Example #8
def test_load_from_cldf_metadata(test_data):
    wl = Wordlist.from_cldf(str(test_data / 'cldf/test-metadata.json'),
                            col="Language_ID".lower(),
                            row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'
Example #9
    def test_load_from_cldf_metadatafree(self):
        wl = Wordlist.from_cldf(test_data('cldf/forms.csv'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())

        assert wl.width == 29
        assert wl.height == 1
        assert wl.entries[0] == 'alignment'
        assert wl.cols[0] == 'anuta'.lower()
        assert wl.cols[28] == 'wallisian'
Example #10
    def test_load_from_cldf_metadata(self):
        wl = Wordlist.from_cldf(
            test_data('cldf/test-metadata.json'),
            col="Language_ID".lower(),
            row="Parameter_ID".lower())

        assert wl.width == 29
        assert wl.height == 1
        assert wl.entries[0] == 'alignment'
        assert wl.cols[0] == 'anuta'.lower()
        assert wl.cols[28] == 'wallisian'
Example #11
def test_load_from_cldf_metadatafree(test_data):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wl = Wordlist.from_cldf(str(test_data / 'cldf/forms.csv'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())

    assert wl.width == 29
    assert wl.height == 1
    assert wl.entries[0] == 'alignment'
    assert wl.cols[0] == 'anuta'.lower()
    assert wl.cols[28] == 'wallisian'
Example #12
def filter_wordlist(wordlist, lang1, lang2):
    """
    Expects a Wordlist instance and returns a new one that retains only
    the entries of the two given languages.
    """
    new_data = {}  # the data formatted as LexStat wants it
    new_data[0] = ['doculect', 'concept', 'ipa', 'index', 'tokens']  # header

    key = 1
    for entry in wordlist._data.values():
        if entry[0] in (lang1, lang2):  # entry[0] is the doculect column
            new_data[key] = entry
            key += 1

    return Wordlist(new_data)
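
A minimal usage sketch with hypothetical data; it assumes lingpy's plain-dict input format with row 0 as the header, which is exactly what filter_wordlist builds, and that 'doculect' stays the first column internally:

from lingpy import Wordlist

# hypothetical three-language wordlist in the dict format used above
data = {
    0: ['doculect', 'concept', 'ipa', 'index', 'tokens'],
    1: ['German', 'hand', 'hant', 1, ['h', 'a', 'n', 't']],
    2: ['English', 'hand', 'hænd', 1, ['h', 'æ', 'n', 'd']],
    3: ['French', 'hand', 'mɛ̃', 1, ['m', 'ɛ̃']],
}

filtered = filter_wordlist(Wordlist(data), 'German', 'English')
assert filtered.width == 2          # only two doculects remain
assert 'French' not in filtered.cols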
Example #13
def make_wordlist(data, dataset_path, schema='ipa'):
    """
    Expects {lang: {gloss: [ipa,]}}; returns a Wordlist instance.
    The last column of the header is needed for the sample ID.
    """
    try:
        tokens = load_tokens(dataset_path, schema)
        assert len(tokens) == len(data)
    except AssertionError:
        raise ValueError('Could not find tokens in {}'.format(dataset_path))

    new_data = {}  # the data formatted as LexStat wants it
    new_data[0] = ['doculect', 'concept', 'ipa', 'index', 'tokens']  # header

    key = 1
    for lang in sorted(data.keys()):
        for gloss in sorted(data[lang].keys()):
            for index, ipa in enumerate(data[lang][gloss]):
                new_data[key] = [lang, gloss, ipa, index + 1]
                new_data[key].append(tokens[lang][gloss][index])
                key += 1

    return Wordlist(new_data)
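
For illustration, a sketch of the input shape make_wordlist expects (hypothetical values; load_tokens is this module's own helper and must return a parallel {lang: {gloss: [tokens,]}} mapping for the same dataset):

# hypothetical input: one list of IPA variants per language and gloss
data = {
    'German': {'hand': ['hant'], 'foot': ['fuːs']},
    'English': {'hand': ['hænd'], 'foot': ['fʊt']},
}

# wordlist = make_wordlist(data, 'datasets/germanic')  # path is hypothetical
# Each resulting row is [doculect, concept, ipa, index, tokens]; the
# 1-based index distinguishes multiple variants of the same gloss.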
Example #14
    def test_load_cldf_and_write(self):
        wl = Wordlist.from_cldf(
            test_data('cldf/test-metadata.json'),
            col="Language_ID".lower(),
            row="Parameter_ID".lower())
        wl.output('tsv', filename=str(self.tmp_path('lingpycldf')))
Example #15
def test_simple_profile(test_data):
    wl = Wordlist(str(test_data / 'KSL6.qlc'))
    prf = list(simple_profile(wl))
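    # rows appear to be (grapheme, segment, frequency, codepoint) tuples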
    assert ('a', 'a', '7', 'U+0061') in prf
    prf = list(simple_profile(wl, clts={'a': 'A'}))
    assert prf[0][1] == 'A'
Example #16
    def test_load_non_wordlist_cldf(self):
        wl = Wordlist.from_cldf(test_data('cldf/non-wordlist-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())
Example #17
def test_load_noexisting_cldf(test_data):
    with pytest.raises(FileNotFoundError):
        wl = Wordlist.from_cldf(str(test_data /
                                    'cldf/test-missing-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())
Example #18
def test_load_non_wordlist_cldf(test_data):
    with pytest.raises(ValueError):
        wl = Wordlist.from_cldf(str(test_data /
                                    'cldf/non-wordlist-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())
Example #19
def wordlist(test_data):
    return Wordlist(str(test_data / 'GER.tsv'))
Example #20
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('GER.tsv'))
Example #21
    def test_load_cldf_and_write(self):
        wl = Wordlist.from_cldf(test_data('cldf/test-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())
        wl.output('tsv', filename=str(self.tmp_path('lingpycldf')))
Example #22
    def test_load_non_wordlist_cldf(self):
        wl = Wordlist.from_cldf(
            test_data('cldf/non-wordlist-metadata.json'),
            col="Language_ID".lower(),
            row="Parameter_ID".lower())
Example #23
def wordlist(test_data):
    return Wordlist(str(test_data / 'colexification.tsv'))
Example #24
    def test_load_noexisting_cldf(self):
        wl = Wordlist.from_cldf(
            test_data('cldf/test-missing-metadata.json'),
            col="Language_ID".lower(),
            row="Parameter_ID".lower())
Example #25
def wl(test_data):
    return Wordlist(str(test_data / 'KSL5.qlc'))
Example #26
def test_load_cldf_and_write(test_data, tmppath):
    wl = Wordlist.from_cldf(str(test_data / 'cldf/test-metadata.json'),
                            col="Language_ID".lower(),
                            row="Parameter_ID".lower())
    wl.output('tsv', filename=str(tmppath / 'lingpycldf'))
Example #27
    def __init__(self, filepath, ngram_length=2):
        """
        Matrix module to format wordlist data into various 2D matrices.
        """

        # TODO: add a debug parameter

        # Get a Wordlist object given the specified input file
        self.wl = Wordlist(filepath)
        self.ngram_length = ngram_length

        # check for language, concept, counterpart in Wordlist object; if missing data, fail
        self.wl_header = self.wl.header
        print(self.wl_header)

        # TODO: check for the items in the header
        """
        if all (k in self.wl_header for k in ("doculect", "concept", "orthoparse")):
            print("Matrix module input requires language ('doculect'), concept ('meaning'), and a qlc-format orthographic parse of the counterpart ('translation') and in wordlist object")
            sys.exit(1)

        sys.exit(1)
        """

        # if not, add one
        # print(self.wl.__getitem__(1))
        # print(self.wl[1,'orthoparse'])
        # a = lambda x:x.split('a')
        # wl.add_entries('juicyIPA','ipa',lambda x:x+x)

        # data structures to store various counts
        self._ngram_to_split_ngram = collections.defaultdict()  # {"pb": "p_b"}; no default factory, so it acts like a plain dict

        # { word_id : { "#w_id":1, "wo_id":1, ...} } -- not ordered
        self._words_ngrams_counts = collections.defaultdict(
            lambda: collections.defaultdict(int))

        # { word_id : ["#w_id", "wo_id", ...] } -- ordered
        self._words_ngrams = collections.defaultdict(list)

        # data stored: {language: {counterpart: count} }
        self._languages_words_counts = collections.defaultdict(
            lambda: collections.defaultdict(int))

        # data stored: {concept: {counterpart: count} }
        self._concepts_words_counts = collections.defaultdict(
            lambda: collections.defaultdict(int))

        # data containers - using sets to discard duplicate ngrams
        non_unique_parsed_words = set()
        non_unique_ngrams = set()
        languages = set()
        concepts = set()
        unique_ngrams = set()

        # loop over the wordlist data and parse into data structures
        for key in self.wl:
            language = self.wl[key, 'doculect']
            language = language.replace("_", "-")  # underscores would clash with the "_" used below as an ID separator
            concept = self.wl[key, 'concept']
            counterpart = self.wl[key, 'orthoparse']
            # print(taxa, gloss, counterpart)

            # loop over the corpus reader data and parse into data structures
            """
            for language, concept, counterpart in language_concept_counterpart_iterator:
            # First do orthography parsing.
            if gram_type == "graphemes":
                parsed_counterpart_tuple = orthography_parser.parse_string_to_graphemes(counterpart) # graphemes
            elif gram_type == "phonemes":
                parsed_counterpart_tuple = orthography_parser.parse_string_to_ipa_phonemes(counterpart) # phonemes
            else:
                sys.exit('\ninvalid gram type: specify "phonemes" or "graphemes"\n')
                
            # TODO: move this to orthography parser
            # If string is unparsable, write to file.
            if parsed_counterpart_tuple[0] == False:
                invalid_parse_string = qlc.ngram.formatted_string_from_ngrams(parsed_counterpart_tuple[1])
                unparsables.write(language+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string+"\n")
                continue
                """

            # parsed_counterpart = parsed_counterpart_tuple[1]

            counterpart = "# " + counterpart + " #"
            parsed_counterpart = tuple(counterpart.split())

            # Get ngrams as a tuple of tuples.
            # ngram_tuples = qlc.ngram.ngrams_from_graphemes(parsed_counterpart, ngram_length)
            ngram_tuples = ng.ngrams_from_graphemes(parsed_counterpart,
                                                    ngram_length)

            # Format that tuple of tuples into a space-delimited string.
            ngrams_string = ng.formatted_string_from_ngrams(ngram_tuples)
            # print(ngrams_string)

            # Format tuple into unigrams split on "_" into a space-delimited string.
            split_ngrams_string = ng.split_formatted_string_from_ngrams(
                ngram_tuples)
            # print(split_ngrams_string)

            # check that the ngrams string ("#a ab b#") and the split
            # ngrams string ("#_a a_b b_#") contain the same number of ngrams
            ngrams_string_list = ngrams_string.split()
            split_ngrams_string_list = split_ngrams_string.split()
            if len(ngrams_string_list) != len(split_ngrams_string_list):
                print("ngrams string and split ngrams string do not match")
                sys.exit(1)

            # store key value pairs for ngram and split ngram; if unigram store the same
            for i in range(len(ngrams_string_list)):
                if self.ngram_length > 1:
                    self._ngram_to_split_ngram[
                        ngrams_string_list[i]] = split_ngrams_string_list[i]
                else:
                    self._ngram_to_split_ngram[
                        ngrams_string_list[i]] = ngrams_string_list[i]

            # Get the parsed version of counterparts.
            parsed_word = ng.formatted_string_from_ngrams(parsed_counterpart)
            # print("og: ", parsed_word)
            parsed_word = parsed_word.replace(" ", "")
            parsed_word = parsed_word.lstrip("#")
            parsed_word = parsed_word.rstrip("#")
            parsed_word = parsed_word.replace("#", " ")
            # print("pg: ", parsed_word)
            # print()

            # flipped
            # parsed_word_id = parsed_word+"_"+language
            parsed_word_id = language + "_" + parsed_word

            # if parsed_word not in dict:

            if parsed_word_id not in self._words_ngrams_counts:
                for ngram in ngrams_string.split():
                    # flipped
                    # non_unique_ngram = language+"_"+ngram
                    non_unique_ngram = language + "_" + ngram
                    non_unique_ngrams.add(non_unique_ngram)
                    self._words_ngrams_counts[parsed_word_id][
                        non_unique_ngram] += 1
                    self._words_ngrams[parsed_word_id].append(non_unique_ngram)

            # update data structures
            # self._languages_words_counts[language][parsed_word+"_"+language] += 1
            # self._concepts_words_counts[concept][parsed_word+"_"+language] += 1

            # flipped
            self._languages_words_counts[language][language + "_" +
                                                   parsed_word] += 1
            self._concepts_words_counts[concept][language + "_" +
                                                 parsed_word] += 1

            # add to header lists
            languages.add(language)  # add language to the unique set of languages
            concepts.add(concept)  # add concept to the unique set of concepts
            unique_ngrams.update(ngram_tuples)  # add all ngram tuples to the unique set

            # add to non-unique header lists
            # non_unique_parsed_words.add(parsed_word+"_"+language)
            # flipped
            non_unique_parsed_words.add(language + "_" + parsed_word)

        # listify the sets so they can be sorted
        self.languages = list(languages)
        self.languages.sort()

        self.concepts = list(concepts)
        self.concepts.sort()

        self.non_unique_parsed_words = list(non_unique_parsed_words)
        self.non_unique_parsed_words.sort()

        self.non_unique_ngrams = list(non_unique_ngrams)
        self.non_unique_ngrams.sort()

        self.unique_ngrams = list(unique_ngrams)
        self.unique_ngrams.sort()
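
The qlc.ngram helpers used above are not shown; as a plain-Python stand-in (not the real ng API), the two string formats whose lengths the loop compares can be sketched like this:

# hypothetical re-implementation of the two ngram string formats
word = ('#', 'a', 'b', '#')  # i.e. "# a b #".split()
ngram_tuples = list(zip(word, word[1:]))  # bigrams: ('#','a'), ('a','b'), ('b','#')

ngrams_string = ' '.join(''.join(t) for t in ngram_tuples)         # "#a ab b#"
split_ngrams_string = ' '.join('_'.join(t) for t in ngram_tuples)  # "#_a a_b b_#"

assert len(ngrams_string.split()) == len(split_ngrams_string.split())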
Example #28
    def setUp(self):
        self.wl = Wordlist(test_data('KSL5.qlc'))
Example #29
def test_context_profile(test_data):
    wl = Wordlist(str(test_data / 'KSL6.qlc'))
    prf = list(context_profile(wl))
    assert prf[2][-2] == '4'  # first line of profile
    prf = list(context_profile(wl, clts={'a': 'A'}))
    assert prf[2][1] == 'A'
Example #30
    def setUp(self):
        WithTempDir.setUp(self)
        self.wordlist = Wordlist(test_data('colexification.tsv'))
        self.cols = colx._get_colexifications(self.wordlist)
Example #31
    def test_load_noexisting_cldf(self):
        wl = Wordlist.from_cldf(test_data('cldf/test-missing-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())