class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "mcelhanonhuon"

    form_spec = FormSpec(missing_data=("-", ""))

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        args.writer.add_sources()
        # language names are lower case in the raw data but title case in the
        # language list, so lower-case the lookup key
        languages = args.writer.add_languages(
            lookup_factory=lambda l: l["ID"].lower())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        cog = CognateRenumber()
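        # CognateRenumber is a helper defined elsewhere in this module;
        # judging from its use below, get_cogid() without an argument
        # presumably mints a fresh singleton cognate set, while
        # get_cogid(label) maps a raw cognacy label to a stable ID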

        for row in self.raw_dir.read_csv("mcelhanon-1967.tsv",
                                         dicts=True,
                                         delimiter="\t"):
            lex = args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=languages[row["Language"]],
                Parameter_ID=concepts[row["Word"]],
                Value=row["Gloss"],
                Comment=row["Annotation"],
                Source="McElhanon1967",
            )

            # note: "".split(",") yields [""], so filter out empty entries to
            # detect rows without any cognacy information
            cognates = [c for c in row["Cognacy"].split(",") if c.strip()]

            if len(cognates) == 0:
                # singleton: no cognacy information, so mint a new cognate set
                cog_id = cog.get_cogid()
            elif len(cognates) == 1:
                cog_id = cog.get_cogid(cognates[0])
            else:
                raise ValueError(
                    "Multiple cognates per lexeme are not handled")

            assert len(lex) == 1, "Should only have one lexeme"
            args.writer.add_cognate(lexeme=lex[0],
                                    Cognateset_ID=cog_id,
                                    Source="McElhanon1967")
Example #2
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "deepadungpalaung"
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(separators=',')

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # here we need to add the lexemes
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
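        # the spreadsheet export has four header rows; data[2] holds the
        # doculect names, and each doculect occupies a pair of columns
        # (form, cognate index), hence the step of 2 below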
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
            for j in range(0, len(row)-2, 2):
                language = data[2][j+2]
                value = row[j+2]
                if value.strip() and value.strip() not in ['-----']:
                    if 'or' not in row[j+3]:
                        cogid = str(int(float(row[j+3])))
                    else:
                        cogid = row[j+3].split()[0]
                    for lexeme in args.writer.add_forms_from_value(
                            Parameter_ID=concepts[number],
                            Language_ID=languages[language],
                            Value=value.strip(),
                            Source='Deepadung2015'):
                        args.writer.add_cognate(
                                lexeme=lexeme,
                                Cognateset_ID=cogid+'-'+number,
                                Source='Deepadung2015')
Example #3
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "zhaobai"
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(separators=";/,")

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        # TODO: add concepts with `add_concepts`
        args.writer.add_language(
            ID="ZhaozhuangBai",
            Glottocode="dali1242",
            ChineseName="趙莊白語",
            Name="Zhaozhuang Bai",
            Latitude=25.5844078,
            Longitude=100.3117,
            Family="Sino-Tibetan",
            DialectGroup="Southern Bai",
        )

        for concept in self.conceptlists[0].concepts.values():
            idx = concept.number + "_" + slug(concept.gloss)
            args.writer.add_concept(
                ID=idx,
                Name=concept.gloss,
                Chinese_Gloss=concept.attributes["chinese"],
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            args.writer.add_forms_from_value(
                Language_ID="ZhaozhuangBai",
                Parameter_ID=idx,
                Value=concept.attributes["form"],
                Source="Zhao2006",
            )

        # We explicitly remove the ISO column since none of the languages in
        # this dataset have an ISO code.
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col
            for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]
Example #4
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "zgraggenmadang"
    language_class = CustomLanguage
    form_spec = FormSpec(
        missing_data=[
            "-0̸-",
            "(ya)-",
            "xx kater",
            "Vb -0̸-",
            "-",
            "0̸",
            "0-",
            "?",
            "-",
            "- ",
            "0̸-",
            "_",
            "-0̸",
        ],
        replacements=[(" ", "_"), ("_+_give", "")],
    )

    def cmd_download(self, args):
        pass

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        languages = args.writer.add_languages(
            id_factory=lambda l: l["Name"],
            lookup_factory=lambda l: (l["Name"], l["Source"]))
        sources = {k[0]: k[1] for k in languages}  # language: source map
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        for row in progressbar(
                self.raw_dir.read_csv("madang.csv", dicts=True,
                                      delimiter="\t")):
            concept = CONCEPT_REMAPPING.get(row["CONCEPT"], row["CONCEPT"])
            args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=row["DOCULECT"],
                Parameter_ID=concepts[concept],
                Value=row["COUNTERPART"],
                Source=sources[row["DOCULECT"]],
            )
Example #5
class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "aaleykusunda"
    language_class = CustomLanguage
    form_spec = FormSpec(separators="~;,/",
                         missing_data=["∅"],
                         first_form_only=True)

    def cmd_makecldf(self, args):
        # add bib
        args.writer.add_sources()
        args.log.info("added sources")

        # add concept
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name",
        )
        # fix concept lookup
        concepts["the barley (Tibetan or highland)"] = concepts[
            "the barley (tibetan or highland)"]
        concepts["to plant (vegetables, rice)"] = concepts[
            "to plant (vegetals, rice)"]
        args.log.info("added concepts")

        # add language
        languages = args.writer.add_languages(lookup_factory="Name")
        args.log.info("added languages")

        # read in data
        data = self.raw_dir.read_csv("Kusunda_2019_250_lexical_items.tsv",
                                     delimiter="\t",
                                     dicts=True)
        # add data
        for entry in pb(data, desc="cldfify", total=len(data)):
            if entry["ENGLISH"] in concepts:
                for key, val in languages.items():
                    args.writer.add_forms_from_value(
                        Language_ID=val,
                        Parameter_ID=concepts[entry["ENGLISH"]],
                        Value=entry[key],
                        Source=["Bodt2019b"],
                    )
Example #6
class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "walworthpolynesian"
    
    form_spec = FormSpec(first_form_only=True)

    def cmd_makecldf(self, args):
        args.writer.add_sources(*self.raw_dir.read_bib())
        
        languages = args.writer.add_languages(
            lookup_factory=lambda l: l['Name']
        )
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name",
        )
        concepts['ash'] = '146_ashes'
        
        
        wl = Wordlist(str(self.raw_dir / 'polynesian-aligned_22112018_corrected.tsv'))
        
        for idx in sorted(wl):
            wl[idx, 'segments'] = fix_segments(wl[idx, 'segments'])
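            # fix_segments and to_boolean are helpers defined elsewhere in
            # this module; the mapping in Segments below additionally replaces
            # "_" (presumably a morpheme boundary) with "+" and normalises
            # "mh" to "mʰ"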
            
            lex = args.writer.add_form_with_segments(
                Language_ID=languages.get(wl[idx, 'doculect']),
                Parameter_ID=concepts.get(wl[idx, 'concept']),
                Value=wl[idx, 'value'],
                Form=wl[idx, 'form'],
                Segments=[{'_': '+', "mh": "mʰ"}.get(x, x) for x in wl[idx, 'segments']],
                Source=[wl[idx, 'source']],
                Cognacy=wl[idx, 'cogid'],
                Loan=to_boolean(wl[idx, 'loan']),
                Comment=wl[idx, 'comment']
            )
            
            args.writer.add_cognate(
                lexeme=lex,
                Source=['walworth_mary_2018_1689909'],
                Cognateset_ID=wl[idx, 'cogid']
            )
Example #7
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "kinbank"
    language_class = CustomLanguage
    concept_class = CustomConcept

    form_spec = FormSpec(
        brackets={"[": "]", "{": "}", "(": ")", "‘": "’"},
        separators=";/,",
        missing_data=('?', '-', '', ''),
        strip_inside_brackets=True)

    def cmd_makecldf(self, args):
        languages = args.writer.add_languages(lookup_factory='Label')

        concepts = args.writer.add_concepts(id_factory=lambda c: c.id,
                                            lookup_factory="Parameter")

        for filename in sorted(self.raw_dir.glob("*/*.csv")):
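            # each CSV file holds one language's kin terms; the file stem is
            # the language Label used for the lookup above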
            lang_id = languages[filename.stem]
            for row in self.raw_dir.read_csv(filename, dicts=True):

                concept_id = concepts.get(row['parameter'], row['parameter'])

                # default to IPA column if present otherwise use word column
                value = row['ipa'] if len(row['ipa']) else row['word']
                if value:
                    lex = args.writer.add_forms_from_value(
                        Language_ID=lang_id,
                        Parameter_ID=concept_id,
                        Value=value,
                        Comment=row['comment'],
                        Source=row['source_bibtex'],
                    )

        args.writer.add_sources()
Example #8
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "chindialectsurvey"

    # add your personalized data types here
    concept_class = CustomConcept
    language_class = CustomLanguage

    # define the way in which forms should be handled
    form_spec = FormSpec(
        brackets={
            "(": ")",
            "[": "]"
        },
        separators=";/,",
        missing_data=("?", "-"),
        strip_inside_brackets=True,
    )

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        data = self.raw_dir.read_csv('wordlist.tsv',
                                     dicts=True,
                                     delimiter='\t')
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="ID")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        for row in progressbar(data, desc="cldfify"):
            if row["DOCULECT"] in languages:
                args.writer.add_forms_from_value(
                    Language_ID=row["DOCULECT"],
                    Parameter_ID=concepts[row["CONCEPT"]],
                    Value=row["TRANSCRIPTION"],
                    Source=["chinds"],
                )
Example #9
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "chingelong"
    language_class = CustomLanguage
    form_spec = FormSpec(missing_data=("---", ),
                         separators="/",
                         replacements=[(" ", "_")])

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = cid
        languages = args.writer.add_languages(lookup_factory="Name_in_Source")
        args.writer.add_sources()

        for row in self.raw_dir.read_csv('data.tsv',
                                         delimiter='\t',
                                         dicts=True):
            for language, lid in languages.items():
                form = row[language].strip()
                if form and form != '---':
                    args.writer.add_forms_from_value(
                        Language_ID=lid,
                        Parameter_ID=concepts[row['English']],
                        Value=form,
                        Source='Chin2015')
Example #10
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "deepadungpalaung"
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(separators=',')

    def cmd_download(self, args):
        print('updating ...')
        with open(self.raw_dir.joinpath("deepadungpalaung.tsv"),
                  "w",
                  encoding="utf-8") as f:
            f.write(fetch("deepadungpalaung"))

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # we combine with the manually edited wordlist to retrieve the lexeme
        # values
        wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix())
        mapper = {
            (concept, language, normalize("NFD", form)): segments
            for (idx, concept, language, form, segments
                 ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens')
        }
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
            concept = row[1].strip()
            for j in range(0, len(row) - 2, 2):
                language = data[2][j + 2]
                value = row[j + 2]
                if value.strip() and value.strip() not in ['-----']:
                    if ',' in row[j + 2]:
                        forms = [v.strip() for v in value.split(',')]
                        cogids = [
                            str(int(float(x)))
                            for x in row[j + 3].split(' or ')
                        ]
                    else:
                        forms = [value.strip()]
                        cogids = [str(int(float(row[j + 3].split(' or ')[0])))]

                    for form, cogid in zip(forms, cogids):
                        try:
                            segments = mapper[concept, languages[language],
                                              form]
                            lexeme = args.writer.add_form_with_segments(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Segments=segments,
                                Source="Deepadung2015")
                        except KeyError:
                            args.log.warning(
                                'lexeme missing {0} / {1} / {2}'.format(
                                    concept, language, form))
                            lexeme = args.writer.add_form(
                                Parameter_ID=concepts[number],
                                Language_ID=languages[language],
                                Value=value.strip(),
                                Form=form,
                                Source="Deepadung2015")
                        args.writer.add_cognate(lexeme=lexeme,
                                                Cognateset_ID=cogid + '-' +
                                                number,
                                                Source="Deepadung2015")
Example #11
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "wichmannmixezoquean"
    language_class = CustomLanguage

    form_spec = FormSpec(
        brackets={"(": ")", "[": "]"},
        separators=",~",
        missing_data=("?", "-"))

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        languages = args.writer.add_languages(
            lookup_factory=lambda l: l["Abbreviation"])

        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        # add multiple forms
        concepts.update({
            # note the mishmash of different dashes etc handled here.
            "hair - 1": "36_hair",
            "hair - 2": "36_hair",
            "see - 1": "72_see",
            "see - 2": "72_see",
            "stand - 1": "79_stand",
            "stand - 2": "79_stand",
            "stand -2": "79_stand",
            "walk/go - 1": "92_walkgo",
            "walk/go - 2": "92_walkgo",
            "worm - 1": "109_worm",
            "worm – 2": "109_worm",
            "worm - 2": "109_worm",
        })

        sources = {l["Abbreviation"]: l["Source"] for l in self.languages}

        data = zip(
            self.raw_dir.read_csv("Wordlist.txt", delimiter="\t"),
            self.raw_dir.read_csv("Cognates.txt", delimiter="\t"),
        )
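        # Wordlist.txt and Cognates.txt share the same row/column layout, so
        # corresponding cells can be zipped: row1 carries the forms, row2 the
        # cognate codes for the same concept/doculect cell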

        cogidx = 1
        header = None
        for i, (row1, row2) in enumerate(data):
            if i == 0:
                header = row1[1:]
            else:
                concept_id = concepts[row1[0].strip()]
                for lang_abbrev, word, cog in zip(header, row1[1:], row2[1:]):
                    if word.strip():
                        if cog.strip().lower() != "na":
                            cogid = concept_id + "-" + cog
                        else:
                            cogid = str(cogidx)
                            cogidx += 1

                        for row in args.writer.add_forms_from_value(
                                Language_ID=languages[lang_abbrev],
                                Parameter_ID=concept_id,
                                Value=word,
                                Source=sources[lang_abbrev],
                                Cognacy=cogid,
                        ):
                            args.writer.add_cognate(lexeme=row,
                                                    Cognateset_ID=cogid,
                                                    Source="Cysouw2006a")
Example #12
class Dataset(BaseDataset):
    id = "castrosui"
    dir = Path(__file__).parent
    concept_class = CustomConcept
    language_class = CustomLanguage

    form_spec = FormSpec(separators=",")

    def cmd_makecldf(self, args):
        wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t")
        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split('-')[-1] + '_' + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Chinese_Gloss=concept.attributes["chinese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.number.rjust(3, "0")] = [idx, concept]
        language_lookup = {k["ID_in_Source"]: k for k in self.languages}
        args.writer.add_languages()
        args.writer.add_sources()

        idx = 1
        mapping = {
            0: [
                "doculect",
                "doculectid",
                "glottocode",
                "concept",
                "glossid",
                "value",
                "phonetic",
                "concepticon_id",
                "concepticon_gloss",
            ]
        }
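        # the raw wordlist appears to alternate between blocks of orthographic
        # values (introduced by a "'Ref#" numbers row) and blocks of phonetic
        # values (introduced by a blank first cell); the `phonetic` flag in
        # the loop below tracks which block we are in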

        for line in progressbar(wl, desc="load the data"):
            if not line[0].strip():
                phonetic = True
            if line[0] == "'Ref#":
                numbers = line
                phonetic = False
                idxs = defaultdict(list)
            elif line[0] == "Gloss":
                glosses = line
            elif line[0] in language_lookup and not phonetic:
                taxon = line[0]
                for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]):
                    if num.strip() and gloss.strip():
                        cname = concept_lookup[num[1:]][1]
                        if val:
                            mapping[idx] = [
                                language_lookup[taxon]["Name"],
                                taxon,
                                language_lookup[taxon]["Glottocode"],
                                cname.english,
                                num[1:],
                                val,
                                "",  # check later for phonetic value
                                cname.concepticon_id,
                                cname.concepticon_gloss,
                            ]

                            idxs[taxon, gloss] += [idx]
                            idx += 1

            elif line[0] in language_lookup and phonetic:
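                # phonetic rows are recognised here but their values are not
                # (yet) written anywhere; see the "check later for phonetic
                # value" placeholder in the mapping header above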
                taxon = line[0]
                for gloss, val in zip(glosses[1:], line[1:]):
                    if gloss.strip():
                        these_idx = idxs.get((taxon, gloss))
                        if not these_idx:
                            pass

        # export to lingpy wordlist in raw folder
        # Wordlist(mapping).output(
        #    "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix()
        # )

        # add data to cldf
        for idx in progressbar(range(1, len(mapping)),
                               desc="cldfify",
                               total=len(mapping)):
            vals = dict(zip(mapping[0], mapping[idx]))

            args.writer.add_forms_from_value(
                Language_ID=language_lookup[vals["doculectid"]]["ID"],
                Parameter_ID=concept_lookup[vals["glossid"]][0],
                Value=vals["value"],
                Source=["Castro2015"],
            )

        # We explicitly remove the ISO code column since the languages in
        # this dataset do not have an ISO code.
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col
            for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]
Example #13
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = 'dunnaslian'

    form_spec = FormSpec(
        brackets={"[": "]", "{": "}", "(": ")"},
        separators=";/,",
        missing_data=('––', '--', '-'),
        strip_inside_brackets=True,
        replacements=[("…", "")])

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        args.writer.add_languages(
            id_factory=lambda l: l['ID'].replace("'", ""))

        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory="Name")

        # empty lines do not play well with dicts=True, unfortunately, so we
        # do it the hard way
        header = None
        for row in self.raw_dir.read_csv(DATAFILE, delimiter="\t"):
            if row[0] == '':
                continue  # empty lines
            elif row[0] == 'WORD':
                header = row[1:]  # remove column 1 so it synchronises below
            else:
                assert header is not None, "header should not be empty here!"
                # headers look like this:
                #   WORD	animal		back		bad		belly		big
                # i.e. have an empty column for cognates.
                # data rows look like this:
                #   Ten'en_Palian	ʔay	0	kaʁɔʔ	0	gbaʔ	3	ʔɛc	0	caw	6
                #   Ten'en_Paborn	bsiŋ	6	kaʁɔʔ	0	baʔ	3	ʔec	0	ʔahaw	7
                #
                # so we grab the languages in the first cell:
                lang = row.pop(0)
                # and then loop over each pair of columns (gloss, cognate) and join.
                for i in range(0, 10, 2):
                    concept = concepts.get(header[i])
                    value = row[i].strip()

                    # get cognacy
                    cogs = COGNATE_OVERRIDES.get(
                        value, [_.strip() for _ in row[i + 1].split(",")])

                    # skip empty forms
                    if len(value) == 0:
                        continue

                    lex = args.writer.add_forms_from_value(
                        Language_ID=lang.replace("'", ""),
                        Parameter_ID=concept,
                        Value=value,
                        Source="DunnKruspeBurenhult2013",
                        Loan=any([is_loan(c) for c in cogs if c]))

                    # handle cognates
                    if len(lex) == 0:
                        continue  # no lexeme, no cognate

                    for k, cog in enumerate(cogs):
                        if cog is None or len(cog) == 0 or is_loan(cog):
                            # ignore empty cognates and loan words
                            continue
                        else:
                            # if we have one lexeme, multiple cognates then add all the
                            # cognates to lexeme[0]. These are:
                            #   wɔŋ ʔəhɔʔ = 9, 2
                            #   kuʔ cɔʔ = 8, 4
                            #   ɲɛ̤h ko̤h rao = 5, 8
                            #   kəbɘʔ ploʔ = 0,1
                            # ... otherwise we should have one cognate
                            # for each lexeme
                            o = lex[k] if len(lex) == len(cogs) else lex[0]
                            args.writer.add_cognate(
                                lexeme=o,
                                Cognateset_ID=concept + "_" + cog,
                                Source="DunnKruspeBurenhult2013")
Example #14
class Dataset(Base):
    dir = pathlib.Path(__file__).parent
    id = "wold"
    lexeme_class = WoldLexeme
    language_class = WoldLanguage
    concept_class = WoldConcept
    form_spec = FormSpec(
        separators="~,",
        first_form_only=True,
        brackets={},  # each language is different, need to do manually
        replacements=[
            (" (1)", ""),
            (" (2)", ""),
            (" (3)", ""),
            (" (4)", ""),
            (" (5)", ""),
            (" (6)", ""),
            ("(f.)", ""),
            ("(1)", ""),
            ("(2)", ""),
            ("(3)", ""),
            ("(4)", ""),
            ("(5)", ""),
            ("(6)", ""),
            ("(2", ""),
            (" ", "_"),
        ],
    )

    def cmd_makecldf(self, args):
        self._schema(args)
        args.writer.add_sources()

        # add the languages from the language file
        # NOTE: the source lists all languages, including proto-languages,
        # but the `forms` only include the first 41 in the list
        language_lookup = args.writer.add_languages(lookup_factory="WOLD_ID")

        desc_dir = self.cldf_dir / 'descriptions'
        if not desc_dir.exists():
            desc_dir.mkdir()
        numentries = {
            r["pk"]: int(r["count_words"])
            for r in self.raw_dir.joinpath("db").read_csv("vocabulary.csv", dicts=True)
        }
        db_contribs = {
            r['id']: r
            for r in self.raw_dir.joinpath('db').read_csv('contribution.csv', dicts=True)}
        for contrib in self.raw_dir.read_csv("contributions.csv", dicts=True):
            db_contrib = db_contribs[contrib['ID']]
            args.writer.objects["ContributionTable"].append(
                dict(
                    ID=contrib["ID"],
                    Name="{} vocabulary".format(contrib["Name"]),
                    Citation=format_citation(contrib, numentries[contrib["ID"]]),
                    Contributor=contrib["Contributors"],
                    Number_of_words=numentries[contrib["ID"]],
                    Language_ID=language_lookup[contrib["ID"]],
                )
            )
            desc = vocabulary_description(
                contrib['Name'], contrib["Contributors"], json.loads(db_contrib['jsondata']))
            p = desc_dir.joinpath('vocabulary_{}.md'.format(contrib['ID']))
            p.write_text(desc, encoding='utf8')

        concepticon = {concept.attributes['wold_id']: concept for concept in self.conceptlists[0].concepts.values()}
        for parameter in self.raw_dir.read_csv("parameters.csv", dicts=True):
            concept = concepticon.get(parameter['ID'])
            args.writer.add_concept(
                ID=parameter['ID'],
                Name=concept.english if concept else parameter['Name'],
                Concepticon_ID=concept.concepticon_id if concept else None,
                Concepticon_Gloss=concept.concepticon_gloss if concept else None,
                Core_list=parameter['CoreList'] == 'true',
                Semantic_field=parameter['SemanticField'],
                Semantic_category=parameter['SemanticCategory'],
                Borrowed_score=float(parameter['BorrowedScore']),
                Age_score=float(parameter['AgeScore']) if parameter['AgeScore'] else None,
                Simplicity_score=float(parameter['SimplicityScore']),
            )

        form2lexeme = {}
        wid2fid = collections.defaultdict(set)
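        # form2lexeme records the raw form ID -> CLDF form ID mapping;
        # wid2fid maps a WOLD word ID to the set of form IDs derived from it,
        # which is needed below to attach borrowing events to forms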
        lexemes_rows = self.raw_dir.read_csv("forms.csv", dicts=True)
        for row in progressbar(lexemes_rows):
            # Add information not in row, so we can pass to `add_form()`
            # with a single comprehension
            row["Language_ID"] = language_lookup[row["Language_ID"]]
            row["Parameter_ID"] = row["Parameter_ID"]
            row["Value"] = row.pop("Form")
            row["Loan"] = float(row["BorrowedScore"]) > 0.6
            row["Borrowed_score"] = row["BorrowedScore"]
            row["Simplicity_score"] = row["SimplicityScore"]
            row["original_script"] = normalize_text(row["original_script"])
            row["comment_on_borrowed"] = normalize_text(row["comment_on_borrowed"])
            row.pop("Segments")
            row['Age_score'] = decimal.Decimal(row.pop('AgeScore')) if row['AgeScore'] else None
            row['Age'] = row.pop('age_label')
            row['Local_ID'] = row['ID']
            row['contact_situation'] = row['ContactSituation']
            row['Comment'] = row.pop('other_comments')

            lexemes = args.writer.add_forms_from_value(
                **{k: v for k, v in row.items() if k in self.lexeme_class.fieldnames()}
            )
            assert len(lexemes) == 1
            form2lexeme[row['ID']] = lexemes[0]['ID']
            wid2fid[row['Word_ID']].add(lexemes[0]['ID'])

        words = {r['pk']: r for r in self.raw_dir.joinpath('db').read_csv('unit.csv', dicts=True)}
        languages = {r['pk']: r['name'] for r in self.raw_dir.joinpath('db').read_csv('language.csv', dicts=True)}
        codes = {r['pk']: r['name'] for r in self.raw_dir.joinpath('db').read_csv('identifier.csv', dicts=True) if r['type'] == 'glottolog'}
        glottocodes = {
            r['language_pk']: codes[r['identifier_pk']]
            for r in self.raw_dir.joinpath('db').read_csv('languageidentifier.csv', dicts=True)
            if r['identifier_pk'] in codes}
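        # the raw db dump is joined here to resolve, for each source word,
        # its languoid name and (where available) the glottocode used in the
        # BorrowingTable below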

        wids = [w['id'] for w in words.values()]
        for wid in wid2fid:
            assert wid in wids

        count = 0
        for row in self.raw_dir.joinpath('db').read_csv('loan.csv', dicts=True):
            assert row['target_word_pk'] in words
            source_word = None
            if row['source_word_pk']:
                assert row['source_word_pk'] in words
                source_word = words[row['source_word_pk']]
            twid = words[row['target_word_pk']]['id']
            for fid in wid2fid[twid]:
                # The meaning-differentiated borrowing events.
                count += 1
                args.writer.objects['BorrowingTable'].append(dict(
                    ID=str(count),
                    Target_Form_ID=fid,
                    # guard against loan events whose source word is unknown
                    # (source_word may be None if source_word_pk was empty)
                    Comment='Source word unidentifiable'
                    if source_word and source_word['name'].lower() == 'unidentifiable' else None,
                    Source_word=source_word['name']
                    if source_word and source_word['name'].lower() != 'unidentifiable' else None,
                    Source_meaning=(source_word['description'] or None) if source_word else None,
                    Source_languoid=languages[source_word['language_pk']] if source_word else None,
                    Source_languoid_glottocode=glottocodes.get(source_word['language_pk']) if source_word else None,
                    Source_relation=row['relation'],
                    Source_certain=row['certain'] == 't',
                ))

    def _schema(self, args):
        args.writer.cldf['FormTable'].common_props['dc:description'] = \
            "Word forms are listed as 'counterparts', i.e. as words with a specific meaning. " \
            "Thus, words with multiple meanings may appear more than once in this table."
        args.writer.cldf['FormTable', 'Comment'].common_props['dc:description'] = \
            "For more specific comments see 'comment_on_borrowed' and 'comment_on_word_form'"
        args.writer.cldf['FormTable', 'Word_ID'].valueUrl = URITemplate('https://wold.clld.org/word/{Word_ID}')
        args.writer.cldf.remove_columns('FormTable', 'Cognacy')

        t = args.writer.cldf.add_component(
            "ContributionTable",
            {
                "name": "Number_of_words",
                "datatype": "integer",
                "dc:description": "There would be 1814 words in each vocabulary, "
                                  "corresponding to the 1814 Loanword Typology meanings, if each meaning "
                                  "had exactly one counterpart, and if all the counterparts were "
                                  'different words. But many ("polysemous") words are counterparts of '
                                  "several meanings, many meanings have several word counterparts "
                                  '("synonyms", or "subcounterparts"), and many meanings have no '
                                  "counterparts at all, so the number of words in each database varies "
                                  "considerably.",
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference",
                "dc:description": "References the language for which this contribution provides "
                                  "a vocabulary.",
            },
        )
        t.common_props['dc:description'] = \
            "WOLD contributions are vocabularies (mini-dictionaries of about 1000-2000 entries) " \
            "with comprehensive information about the loanword status of each word. " \
            "Descriptions of how these vocabularies coded the data can be found in the " \
            "[descriptions](descriptions/) directory."
        args.writer.cldf['ContributionTable', 'description'].valueUrl = URITemplate(
            './descriptions/vocabulary_{ID}.md')
        args.writer.cldf['ContributionTable', 'description'].common_props['dc:format'] = 'text/markdown'
        args.writer.cldf['ContributionTable', 'id'].common_props["dc:description"] = \
            "The vocabulary ID number corresponds to the ordering to the chapters on the book " \
            "Loanwords in the World's Languages. Languages are listed in rough geographical order " \
            "from west to east, from Africa via Europe to Asia and the Americas, so that " \
            "geographically adjacent languages are next to each other."
        args.writer.cldf['ContributionTable', 'citation'].common_props["dc:description"] = \
            "Each vocabulary of WOLD is a separate electronic publication with a separate author " \
            "or team of authors and should be cited as specified here."
        args.writer.cldf['ContributionTable', 'contributor'].common_props["dc:description"] = \
            "The authors are experts of the language and its history. They also contributed a " \
            "prose chapter on the borrowing situation in their language that was published in the " \
            "book Loanwords in the World's Languages."
        t.add_foreign_key("Language_ID", "languages.csv", "ID")

        t = args.writer.cldf.add_component(
            'BorrowingTable',
            {
                'name': 'Source_relation',
                'datatype': {'base': 'string', 'format': "immediate|earlier"},
                'dc:description':
                    "Whether a word was contributed directly (immediate) or indirectly (earlier), "
                    "i.e. via another, intermediate donor languoid, to the recipient language.",
            },
            'Source_word',
            'Source_meaning',
            {
                'name': 'Source_certain',
                'datatype': {'base': 'boolean', 'format': "yes|no"},
                'dc:description': "Certainty of the source identification",
            },
            {
                'name': 'Source_languoid',
                'dc:description': 'Donor languoid, specified as name of a language or language subgroup or family',
            },
            {
                'name': 'Source_languoid_glottocode',
                'dc:description': 'Glottocode of the source languoid',
                'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#glottocode',
            }
        )
        t.common_props['dc:description'] = \
            'While a lot of information about the borrowing status is attached to the borrowed ' \
            'forms, the BorrowingTable lists information about (potential) source words. Note ' \
            'that we list loan events per meaning; i.e. one loanword may result in multiple ' \
            'borrowings if the word has multiple meanings.'
Example #15
class Dataset(BaseDataset):
    id = "transnewguineaorg"
    dir = Path(__file__).parent

    @staticmethod
    def get_slug_from_uri(uri):
        return [_ for _ in uri.split("/") if _][-1]

    form_spec = FormSpec(
        brackets={
            "(": ")",
            "[": "]"
        },
        separators=";/,|<",
        missing_data=("?", "-", "*", "---", "-BB:SRP", '*-', '*'),
        strip_inside_brackets=True,
        replacements=[
            (" ", "_"),
            ('_+_modif.', ''),
            ('_+_verb', ''),
            ('_+_PL', ''),
            ('_+_mdf', ''),
            ('_+_mod', ''),
            ("_+_'make", ''),
            ("ɬ ̥", "ɬ̥"),
            ("l ̥", "l̥"),
            ('"', "'"),
            (" ?", ""),
            ("91)", ""),
            ("') :", ""),
            ("a ͥ", "aj"),
            ("<<̋>>"[2:-2], ""),
            (" ̟", ""),
        ],
    )

    def cmd_makecldf(self, args):
        languages = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "languages.json")
        }
        words = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "words.json")
        }
        sources = {
            o["slug"]: o
            for o in self.raw_dir.read_json(self.raw_dir / "sources.json")
        }
        # handle sources
        # want to make sure that the bibtex key matches our source id.
        for source in sorted(sources):
            # this is ugly, I wish pybtex made this easier!
            bib = parse_string(sources[source]["bibtex"], "bibtex")
            old_key = list(bib.entries.keys())[0]
            bib.entries[old_key].key = source
            bib.entries = OrderedCaseInsensitiveDict([(source,
                                                       bib.entries[old_key])])
            args.writer.add_sources(bib)

        # handle languages
        for lang in sorted(languages):
            args.writer.add_language(
                ID=lang,
                Name=languages[lang]["fullname"],
                ISO639P3code=languages[lang]["isocode"],
                Glottocode=languages[lang]["glottocode"],
            )

        # handle concepts
        concepts = {}
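        # register several spelling variants of each gloss as lookup keys,
        # since the word slugs extracted from the API URIs vary in casing,
        # hyphenation and bracketing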
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concepts[concept.english] = idx
            concepts[concept.english.replace(" ", "-")] = idx
            concepts[concept.english.replace(" ", "-").lower()] = idx
            concepts[slug(concept.english)] = idx
            concepts["-".join([slug(x)
                               for x in concept.english.split()])] = idx

            if '(' in concept.english:
                new_string = concept.english[:concept.english.index('(') - 1]
                concepts["-".join([slug(x) for x in new_string.split()])] = idx
                concepts[concept.english[:concept.english.index('(') -
                                         1]] = idx
                concepts[concept.english[:concept.english.index('(') -
                                         1].replace(' ', '-').lower()] = idx
            if concept.english.startswith("to "):
                new_string = concept.english[3:]
                concepts['-'.join([slug(x) for x in new_string.split()])] = idx
                concepts[concept.english.replace("to ", "")] = idx
        concepts["mans-mother-law"] = concepts["man's mother in law"]
        concepts["brother-law"] = concepts["brother in law"]
        concepts["to-make-hole"] = concepts["make hole (in ground)"]
        concepts["front"] = concepts["in front"]
        concepts["husk-nut"] = concepts["husk (of nut)"]
        concepts["his"] = concepts["his, hers, its (pronoun p:3s)"]
        concepts["we-two-incl"] = concepts[
            "we incl. dual (pronoun d:1p, incl, dual)"]
        concepts["intrnasitivizer"] = concepts["intransitivizer"]
        concepts["short-piece-wood"] = concepts["short-piece-of-wood"]
        concepts["top-foot"] = concepts["top (of foot)"]
        concepts["sit-feet-and-legs-together"] = concepts[
            "sit (with feet and legs together)"]
        concepts["earth"] = concepts["earth/soil"]
        concepts["warm"] = concepts["warm/hot"]
        concepts["your-sg"] = concepts["your (pronoun: p:2s)"]
        concepts["-law"] = concepts["in-law"]
        concepts["to-roast"] = concepts["roast"]
        concepts["arrow-barred"] = concepts[
            "arrow (barred) (Arrow with cross bar)"]
        concepts["them-dual"] = concepts["them (pronoun o:3p, dual)"]
        concepts["you-dual"] = concepts["you (pronoun d:2s)"]
        concepts["right-correct"] = concepts["right (correct, true)"]
        concepts["betelpepper"] = concepts["betelpepper vine"]
        concepts["to-chop"] = concepts["to chop, cut down"]
        concepts["road"] = concepts["road/path"]
        concepts["for-benefactive-clitic"] = concepts[
            "for (benefactive) ((cliticised or suffixed to noun))"]
        concepts["mans-father-law"] = concepts["mans' father in law"]
        concepts["sister-law"] = concepts["sister in law"]
        concepts["you-o2s"] = concepts["you (pronoun o:2s)"]
        concepts["you-pl-o2p"] = concepts["you pl. (pronoun o:2p)"]
        concepts["we-pl-incl"] = concepts["we incl. (pronoun d:1p, incl)"]
        concepts["in"] = concepts["in, inside"]
        concepts["not_know"] = concepts["not know"]
        concepts["their-dual"] = concepts["their (pronoun p:3p, dual)"]
        concepts["blow-fire"] = concepts["blow (on fire)"]
        concepts["blunt-eg-knife"] = concepts["blunt (of e.g. knife)"]
        concepts["our-dual"] = concepts["our (two) (pronoun p:1p, dual)"]
        concepts["your-pl-dual"] = concepts[
            "your (two) pl (pronoun p:2p, dual)"]
        concepts["suck-breast"] = concepts["to suck at breast"]
        concepts["draw-water-carry"] = concepts["draw water / carry"]
        concepts["tree-sp-Gnetum-gnemon"] = concepts[
            "tree sp. (Gnetum gnemon)"]
        concepts["he-she"] = concepts["he, she, it, that, those"]
        concepts["fed"] = concepts["fed up (with)"]
        concepts["you-pl-dual-o2p"] = concepts[
            "you plural two (pronoun d:2p, dual)"]
        concepts["you-pl-dual"] = concepts["you two (pronoun d:2s, dual)"]
        concepts["to-put"] = concepts["to put, give"]
        concepts["he-she-it-those"] = concepts["he, she, it, that, those"]
        concepts["we-two-excl"] = concepts[
            "we excl. dual (pronoun d:1p, excl, dual)"]
        concepts["we-pl-excl"] = concepts[
            "we excl. plural (pronoun d:1p, excl, plural)"]
        #concepts["affix-body-part"] = concepts[""]

        itemfiles = [
            f for f in self.raw_dir.iterdir() if f.name.startswith("language-")
        ]
        errors = set()
        for filename in progressbar(sorted(itemfiles), desc="adding lexemes"):
            for o in sorted(self.raw_dir.read_json(filename),
                            key=lambda d: d["id"]):
                wordid = self.get_slug_from_uri(o['word'])
                if wordid in concepts:
                    args.writer.add_forms_from_value(
                        Local_ID=o["id"],
                        Language_ID=self.get_slug_from_uri(o["language"]),
                        Parameter_ID=concepts[wordid],
                        Value=o["entry"],
                        Source=self.get_slug_from_uri(o["source"]),
                        Comment=o["annotation"],
                    )
                else:
                    errors.add(("concept", wordid))
        for error in errors:
            args.log.info("error with {0[0]}: {0[1]}".format(error))

        args.log.info("found {0} errors in concepts".format(len(errors)))

    def get_all(self, url):
        """Helper function to iterate across the API's _next_ commands for a given URL"""
        while True:
            j = get_url(url).json()
            yield j["objects"]
            if not j["meta"]["next"]:
                break
            url = BASE_URL + j["meta"]["next"]

    def cmd_download(self, args):
        if not self.raw_dir.exists():
            self.raw_dir.mkdir()

        for fname in self.raw_dir.iterdir():
            remove(fname)

        # sources
        sources = []
        for j in self.get_all(SOURCES_URL % {"limit": LIMIT}):
            sources.extend(j)
        jsondump(sources, self.raw_dir / "sources.json", args.log)

        # languages
        languages = []
        for j in self.get_all(LANGUAGES_URL % {"limit": LIMIT}):
            languages.extend(j)
        jsondump(languages, self.raw_dir / "languages.json", args.log)

        # words
        words = []
        for j in self.get_all(WORDS_URL % {"limit": LIMIT}):
            words.extend(j)
        jsondump(words, self.raw_dir / "words.json", args.log)

        # items
        for language in languages:
            items = []
            for j in self.get_all(RECORDS_URL % {
                    "limit": LIMIT,
                    "language": language["id"]
            }):
                items.extend(j)
            jsondump(
                items,
                self.raw_dir / ("language-%d.json" % language["id"]),
                args.log,
            )

        # version information
        with open(self.raw_dir / "version.txt", "w") as handle:
            handle.write(str(datetime.now()))
Example #16
class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "huntergatherer"
    lexeme_class = HGLexeme
    concept_class = HGConcept
    form_spec = FormSpec(missing_data=("?", "[missing]", "missing", "#NAME?",
                                       "X", "[absent]", "-", "--", "..."))

    def _get(self, path, log):
        with self.raw_dir.temp_download(self.metadata.url + path, ".html",
                                        log) as fname:
            return BeautifulSoup(fname.read_text(encoding="utf8"),
                                 "html.parser")

    def cmd_download(self, args):
        for a in self._get("/languages", args.log).find_all("a", href=True):
            if a["href"].startswith("/languages/language/"):
                parse(self._get(a["href"], args.log), a["href"].split("/")[-1],
                      self.raw_dir)

    @staticmethod
    def get_tokenizer():
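        # override the default tokenizer hook: segment the form (second
        # argument) with lingpy's ipa2tokens, keeping vowels unmerged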
        return lambda x, y: ipa2tokens(y, merge_vowels=False)

    def cmd_makecldf(self, args):
        concepts = args.writer.add_concepts(
            id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
            lookup_factory="Database_ID",
        )

        language_map = {
            lang["ID"]: lang["Glottocode"] or None
            for lang in self.languages
        }

        sources = {}
        for path in sorted(self.raw_dir.glob("*.json"),
                           key=lambda _p: int(_p.stem)):
            data = jsonlib.load(path)
            iso = data.get("ISO 639-3")
            if iso:
                iso = iso.strip()
            args.writer.add_language(
                ID=data["id"],
                Name=data["name"],
                ISO639P3code=iso if iso not in {"no", "XXX"} else None,
                Glottocode=language_map[data["id"]],
            )

            for table in ["basic", "flora", "cult"]:
                if table not in data["tables"]:
                    continue
                for item in data["tables"][table]["rows"]:
                    item = dict(zip(data["tables"][table]["header"], item))
                    form = item["Orthographic Form"].strip()
                    if form:
                        refs = [
                            ref for ref in itersources(item, data, sources)
                            if ref
                        ]
                        args.writer.add_sources(*[ref.source for ref in refs])
                        href, _ = item["English"]

                        concept_database_id = href.split("/")[-1]

                        if not concepts.get(concept_database_id):
                            # https://huntergatherer.la.utexas.edu/lexical/feature/729
                            # is missing from the concept list(s)
                            continue

                        args.writer.add_lexemes(
                            Language_ID=data["id"],
                            Parameter_ID=concepts[concept_database_id],
                            Value=form,
                            Loan=bool(item["Loan Source"]
                                      or item["Wanderwort Status"]),
                            Phonemic=item["Phonemicized Form"] or None,
                            Source=["%s" % ref for ref in refs],
                            Creator=item.get("Created By"),
                            Comment=item.get("General Notes"),
                        )
Example #17
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "gaotb"
    language_class = CustomLanguage
    concept_class = CustomConcept
    form_spec = FormSpec(
        missing_data=("---",),
        separators="/;",
        replacements=[
            (" ", "_"), ('\u0306', ''), ('\u0329', ''), ('\u0303', ''),
            ('\u0325', ''), ('\u0335', ''), ('\u0331', '')],
        first_form_only=True,
    )

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            cid = '{0}_{1}'.format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=cid,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Number=concept.number
            )
            concepts[concept.number] = cid
        args.log.info('[i] added concepts')
        languages = args.writer.add_languages(lookup_factory="Number")
        args.log.info('[i] added languages')
        args.writer.add_sources()
        
        missingL, missingC = set(), set()
        missingCog = set()
        cogids = {}
        for row in progressbar(
                self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)):
            lid = languages.get(row['LANGUAGE'])
            cid = concepts.get(row['SID'])
            # take only the first cognate ID if there are several
            cog = row['COGNATE'].split('|')[0]
            if lid and cid and row["FORM"] and row["FORM"].strip():
                lexemes = args.writer.add_forms_from_value(
                    Language_ID=lid,
                    Parameter_ID=cid,
                    Value=row["FORM"],
                    Source='Sun1991'
                )
                if cog.strip():
                    cogid = cid+'-'+cog
                    args.writer.add_cognate(
                            lexeme=lexemes[0],
                            Cognateset_ID=cogid,
                            Cognate_Detection_Method='expert',
                            Source='Gao2020'
                            )
                else:
                    # record lexemes that lack a cognate annotation
                    missingCog.add((lid, cid))

            if not lid:
                missingL.add(row['LANGUAGE'])
            if not cid:
                missingC.add(row['SID'])
        for entry in missingL:
            print('missing L {0}'.format(entry))
        for entry in missingC:
            print('missing C {0}'.format(entry))
        for entry in missingCog:
            print('missing Cognate {0}'.format(entry))
Example #18
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "pharaocoracholaztecan"
    concept_class = CustomConcept
    form_spec = FormSpec(
        separators="/",
        first_form_only=False,
        brackets={"’": "’", "(": ")"},
        replacements=[("*", ""), (" ", "_")],
    )

    def cmd_makecldf(self, args):
        # parse the data from the word document
        table = [[""]]  # we expect 9 columns
        with open(self.raw_dir.joinpath("data.txt").as_posix()) as f:
            previous = []
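            # rows whose first cell is not a numbered entry are continuation
            # lines: their first cell is appended (with a "/" separator) to
            # the last cell of the previous row and the remaining cells
            # become new columns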
            for i, line in enumerate(f):
                rows = [c.strip() for c in line.split("\t")]
                if rows[0].replace(".", "").isdigit():
                    table += [rows]
                else:
                    table[-1][-1] += "/" + rows[0]
                    table[-1] += rows[1:]
        # load cognates
        cognates = self.raw_dir.read_csv("cognates.tsv", delimiter="\t")[1:]
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = "{0}-{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Spanish_Gloss=concept.attributes["spanish"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            for gloss in concept.attributes["lexibank_gloss"]:
                concepts[gloss] = idx
        concepts["Frio/(hace frio)"] = concepts["Frio (hace frio)"]
        args.log.info("added concepts")

        args.writer.add_sources()
        cognacy, counter = {}, 1
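        # Map the raw cognacy annotations onto lists of cognate classes, one
        # class per slash-separated form; bracketed or questioned readings are
        # reduced to a single class.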
        cogsets = {
            "A(B)": ["A"],
            "A/(B)": ["A"],
            "A/B": ["A", "B"],
            "A/B/C": ["A", "B", "C"],
            "A/B/D": ["A", "B", "D"],
            "A/B?": ["A"],
            "A/C": ["A", "C"],
            "B/(A)": ["A"],
            "B/(a)": ["B"],
            "B/C": ["B", "C"],
            "C D": ["C", "D"],
            "C/(B)": ["C"],
            "C/B": ["C", "B"],
            "C/E": ["C", "E"],
            "D/B": ["D", "B"],
            "a/(B)": ["A"],
            "a/A": ["A", "A"],
            "a/B": ["A", "B"],
            "ab": ["A", "B"],
        }
        languages = args.writer.add_languages(lookup_factory="Name")
        for i, line in progressbar(enumerate(table[1:])):
            for j, (language, cell) in enumerate(zip(table[0][2:], line[2:])):
                if cell.strip():

                    cognatesets = cogsets.get(
                        cognates[i][j + 1].strip(), [cognates[i][j + 1].strip().upper()]
                    )

                    for lexeme, cognate in zip(
                        args.writer.add_forms_from_value(
                            Value=cell,
                            Language_ID=languages[language],
                            Parameter_ID=concepts[line[1]],
                            Source=["Pharao2020"],
                        ),
                        cognatesets,
                    ):
                        if cognate in ["?", "-"]:
                            cid = counter
                            counter += 1
                        else:
                            cid = "{0}-{1}".format(i, cognate)
                            if cid in cognacy:
                                cid = cognacy[cid]
                            else:
                                cognacy[cid] = counter
                                cid = cognacy[cid]
                                counter += 1
                        if languages[language] == "ProtoUtoAztecan" and "SUA" in cell.strip():
                            lexeme["Language_ID"] = languages["SUA"]

                        args.writer.add_cognate(lexeme, Cognateset_ID=cid, Source=["Pharao2020"])
Example #19
class Dataset(BaseDataset):
    id = "tppsr"
    dir = Path(__file__).parent
    concept_class = CustomConcept
    language_class = CustomLanguage
    lexeme_class = CustomLexeme
    form_spec = FormSpec(first_form_only=True,
                         missing_data=("#NAME?", ),
                         replacements=[("- - ", "-"), (" - ", "-"),
                                       ("- ", "-"), (" -", "")])

    def cmd_makecldf(self, args):
        args.writer.add_sources()

        # We can link forms to scans of the page in the source where they appear:
        args.writer.cldf["FormTable", "Scan"].valueUrl = URITemplate(
            'https://cdstar.shh.mpg.de/bitstreams/{Objid}/gauchat_et_al_1925_tppsr_{Scan}.png'
        )
        for c in ['Population', 'SpeakerAge']:
            args.writer.cldf['LanguageTable', c].datatype.base = 'integer'
            args.writer.cldf['LanguageTable', c].datatype.minimum = 0

        values = self.raw_dir.read_csv('tppsr-db-v20.txt', delimiter='\t')
        forms = self.raw_dir.read_csv('tppsr-db-v20-ipa-narrow.txt',
                                      delimiter='\t')

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = '{0}_{1}'.format(concept.id,
                                   slug(concept.attributes['french']))
            args.writer.add_concept(
                ID=idx,
                Number=concept.number,
                Name=concept.english,
                French_Gloss=concept.attributes['french'],
                Latin_Gloss=concept.attributes['latin'],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss)
            concepts[concept.number] = (idx, concept.attributes['page'],
                                        concept.attributes['french'])

        languages = args.writer.add_languages(lookup_factory='Number')

        def scan_number(bitstreams):
            p = re.compile(r'tppsr_(?P<number>[0-9]{4})\.png')
            for bs in bitstreams:
                m = p.search(bs['bitstreamid'])
                if m:
                    return m.group('number')

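        # Map four-digit scan numbers to CDSTAR object IDs; both feed the
        # valueUrl template for the Scan column defined above.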
        scans = {
            scan_number(o['bitstreams']): objid
            for objid, o in self.raw_dir.read_json('tppsr_scans.json').items()
        }

        phrase_data = collections.defaultdict(dict)
        for row1, row2 in progressbar(zip(values, forms), desc='cldfify'):
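            # row1 is assumed to hold [concept number, language number,
            # transcription]; row2 carries the corresponding narrow-IPA entry.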
            entry = row1[2]
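            # strip U+0320 COMBINING MINUS SIGN BELOW from the entry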
            for s, t in [('\u0320', '')]:
                entry = entry.replace(s, t)
            tokens = self.tokenizer({},
                                    entry.strip().replace(' ', '_'),
                                    column='IPA')
            # Compute scan number from concept number and language number.
            page = int(concepts[row1[0]][1]) + int(int(row1[1]) > 31)
            scan = str(page + 18).rjust(4, '0')

            if row1[2].replace('_', '').replace('-', '').strip():
                phrase_data[row1[1]][row1[0]] = (row2[2], row1[2])
                args.writer.add_form_with_segments(
                    Value=row1[2],
                    Form=''.join(tokens),
                    Segments=tokens,
                    Profile=' '.join(
                        self.tokenizer({}, entry.strip(), column='Grapheme')),
                    Source=['Gauchat1925[{0}]'.format(page)],
                    Language_ID=languages[row1[1]],
                    Parameter_ID=concepts[row1[0]][0],
                    Scan=scan,
                    Objid=scans[scan],
                    ProsodicStructure=prosodic_string(tokens, _output='CcV'),
                    SegmentedValue=' '.join(
                        self.tokenizer({}, entry, column='Graphemes')))

        args.writer.cldf.add_component(
            'ExampleTable',
            'Alt_Transcription',
            {
                "name": "Concept_ID",
                "separator": " ",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
            },
            {
                "name": "Form_ID",
                "separator": " ",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#formReference",
            },
        )
        args.writer.cldf.add_foreign_key('ExampleTable', 'Concept_ID',
                                         'ParameterTable', 'ID')
        args.writer.cldf.add_foreign_key('ExampleTable', 'Form_ID',
                                         'FormTable', 'ID')

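        # Build one example per phrase and variety by concatenating the forms
        # recorded for the phrase's concepts; varieties lacking one of the
        # concepts are skipped via the KeyError below.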
        for phrase in self.etc_dir.read_csv('phrases.csv', dicts=True):
            for lid, data in sorted(phrase_data.items(), key=lambda i: i[0]):
                lid = languages[lid]
                cids = phrase['Concepts'].split()
                try:
                    args.writer.objects['ExampleTable'].append(
                        dict(
                            ID='{}-{}'.format(phrase['ID'], lid),
                            Language_ID=lid,
                            Primary_Text=' '.join(
                                [data[cid][0] for cid in cids]),
                            Translated_Text=' '.join(
                                [concepts[cid][2] for cid in cids]),
                            Alt_Transcription=' '.join(
                                [data[cid][1] for cid in cids]),
                            Concept_ID=[concepts[cid][0] for cid in cids],
                            Form_ID=[
                                '{}-{}-1'.format(lid, concepts[cid][0])
                                for cid in cids
                            ],
                        ))
                except KeyError:
                    pass
Example #20
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "peirosaustroasiatic"
    language_class = CustomLanguage
    lexeme_class = CustomLexeme
    cross_concept_cognates = True
    form_spec = FormSpec(separators=("/", ","),
                         strip_inside_brackets=True,
                         brackets={
                             "[": "]",
                             "(": ")",
                             "<": ">"
                         })

    def cmd_makecldf(self, args):
        # add sources
        args.writer.add_sources()
        # add concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        # fix concept
        concepts["fat (n.)"] = concepts["fat n."]
        concepts["burn (tr.)"] = concepts["burn tr."]
        concepts["to fly"] = concepts["fly v."]
        concepts["lie (down)"] = concepts["lie"]
        concepts["walk (go)"] = concepts["walk(go)"]
        args.log.info("added concepts")
        # add languages
        languages = {}
        for language in self.languages:
            args.writer.add_language(**language)
            languages[language["Name"]] = language["ID"]
        args.log.info("added languages")
        # add data
        for row_ in progressbar(
                self.raw_dir.read_csv("Peiros2004-data by etymology.txt",
                                      delimiter="\t")):
            if "".join(row_).strip():
                row = dict(
                    zip(["CONCEPT", "SUBGROUP", "LANGUAGE", "FORM", "COGNACY"],
                        row_))
                bsource = ""
                if row["COGNACY"].isdigit():
                    cogid = int(row["COGNACY"])
                elif row["COGNACY"].startswith("<"):
                    bsource = row["COGNACY"].split(" ")[1]
                    cogid = 0
                else:
                    cogid = 0

                for lexeme in args.writer.add_forms_from_value(
                        Parameter_ID=concepts[re.sub("'", "", row["CONCEPT"])],
                        Language_ID=languages[row["LANGUAGE"].strip()],
                        Value=row["FORM"].strip(),
                        Source=["Peiros2004a"],
                        LoanSource=bsource,
                        Loan=True if bsource else False,
                ):
                    args.writer.add_cognate(lexeme,
                                            Cognateset_ID=cogid,
                                            Source=["Peiros2004a"])
class Dataset(BaseDataset):
    id = "davletshinaztecan"
    dir = Path(__file__).parent
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(
        missing_data=["*", "---", "-"],
        separators=";/,~",
        strip_inside_brackets=True,
        replacements=[(" ", "_")],
        brackets={"(": ")"},
        first_form_only=True,
    )

    def cmd_makecldf(self, args):
        # Add bibliographic sources and collect them
        args.writer.add_sources()
        sources, languages = {}, {}
        for language in self.languages:
            sources[language["NameInData"]] = language["Source"]
            languages[language["NameInData"]] = language["ID"]
            args.writer.add_language(**language)

        # Add concepts and collect them
        concepts, proto = {}, {}
        for concept in self.conceptlists[0].concepts.values():
            idx = "{0}_{1}".format(concept.number, slug(concept.english))
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                ProtoAztecan=concept.attributes["proto_aztecan"],
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
            proto[concept.number] = concept.attributes['proto_aztecan']

        cogidx = 0
        with open(self.raw_dir.joinpath("data.txt").as_posix()) as f:
            for line in progressbar(f, desc="cldfify"):
                number, concept = line.split(" :: ")[0].split(". ")
                entries = re.split(r"(\(-*[0-9]\))[,\.]*",
                                   line.split(" :: ")[1])
                cogids, count, borrowing = [], 0, False
                for i in range(0, len(entries) - 1, 2):
                    entry = entries[i].strip()
                    cogid = int(entries[i + 1][1:-1])
                    if cogid < 0:
                        borrowing = True
                        cogid = len(entries) + count
                        count += 1
                    language = entry.split(" ")[0]
                    value = " ".join(entry.split(" ")[1:])
                    for lex in args.writer.add_forms_from_value(
                            Language_ID=languages[language],
                            Parameter_ID=concepts[number],
                            Value=value,
                            Source=[sources[language]],
                            Loan=borrowing,
                    ):
                        args.writer.add_cognate(
                            lexeme=lex,
                            Cognateset_ID=cogid + cogidx,
                            Source="Davletshin2012",
                        )
                    cogids += [cogid]

                # add proto-aztecan form
                if proto[number].strip() != "?":
                    for lex in args.writer.add_forms_from_value(
                            Language_ID=languages["PA"],
                            Parameter_ID=concepts[number],
                            Value=proto[number],
                            Source=sources["PA"],
                    ):
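                        # assign the proto-form to the cognate set that is
                        # most frequent among the reflexes of this concept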
                        args.writer.add_cognate(
                            lexeme=lex,
                            Cognateset_ID=sorted(
                                cogids,
                                key=lambda x: cogids.count(x),
                                reverse=True,
                            )[0] + cogidx,
                        )
                        cogids += [cogid]

                cogidx += max(cogids)
Example #22
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "holmie"
    language_class = CustomLanguage
    concept_class = CustomConcept
    form_spec = FormSpec(
        missing_data=("-", ),
        separators="/,;",
        replacements=[(" ", "_")],
        strip_inside_brackets=False,
        first_form_only=True,
        brackets={},
    )

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        concepts, wl_concepts = {}, {}
        visited = set()
        for concept in self.concepts:
            cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
            if cid not in visited:
                visited.add(cid)
                args.writer.add_concept(
                    ID=cid,
                    Name=concept['ENGLISH'],
                    Glosses_in_Source=concept['GLOSSES_IN_SOURCE'],
                    Concepticon_ID=concept['CONCEPTICON_ID'],
                    Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
                for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '):
                    concepts[gloss] = cid
                    wl_concepts[gloss] = concept['ENGLISH']

        languages = args.writer.add_languages(lookup_factory="Name_in_Source")
        args.writer.add_sources()

        # make a wordlist for edictor to inspect the data
        D = {0: ['doculect', 'concept', 'ipa', 'cogid']}
        idx = 1

        for i, row in progressbar(
                enumerate(
                    self.raw_dir.read_csv('data.tsv',
                                          delimiter='\t',
                                          dicts=True))):
            for language, lid in languages.items():
                form = row[language].strip()
                if form:
                    lexemes = args.writer.add_forms_from_value(
                        Language_ID=lid,
                        Parameter_ID=concepts[row['Meaning']],
                        Value=form,
                        Source='Holm2017')
                    if lexemes:
                        args.writer.add_cognate(
                            lexeme=lexemes[0],
                            Cognateset_ID=str(i + 1),
                            Cognate_Detection_Method='expert',
                            Source='Holm2017')
                        D[idx] = [
                            language, wl_concepts[row['Meaning']], form, i + 1
                        ]
                        idx += 1
        Wordlist(D).output(
            'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
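        # The resulting raw/wordlist.tsv can then be opened in EDICTOR or
        # loaded back with lingpy's Wordlist for manual inspection.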
Example #23
class Dataset(BaseDataset):
    """
    Defines the dataset for Lieberherr and Bodt (2017).
    """

    id = "lieberherrkhobwa"
    dir = Path(__file__).parent
    language_class = KBLanguage
    form_spec = FormSpec(separators="~/,;ткд", missing_data=("NA", ))

    def cmd_download(self, **kw):
        """
        Download the raw zipped data.
        """
        zip_url = (
            "https://zenodo.org/api/files/5469d550-938a-4dae-b6d9-50e427f193b3/"
            "metroxylon/subgrouping-kho-bwa-v1.0.0.zip")

        self.raw_dir.download(zip_url, "kho-bwa-v1.0.0.zip")

    def cmd_makecldf(self, args):
        # Add bibliographic sources
        args.writer.add_sources()

        # Read raw concept data and add to dataset; at the same time,
        # build a map between the concept index as used in data and the
        # concept id in the dataset
        concept_lookup = {}
        for cidx, concept in enumerate(self.conceptlists[0].concepts.values()):
            concept_cldf_id = (concept.id.split("-")[-1] + "_" +
                               slug(concept.english))
            concept_lookup[1 + (cidx * 2)] = concept_cldf_id

            # Add the concept
            args.writer.add_concept(
                ID=concept_cldf_id,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )

        # Add languages and make a map for individual sources
        language_lookup = args.writer.add_languages(
            lookup_factory="Source_Name")
        source_lookup = {
            entry["Source_Name"]: entry["Source"]
            for entry in self.languages
        }

        # Read raw data and remove headers and rows with reconstructions
        # (row[0] not in languages)
        data = self.raw_dir.read_csv("dataset_khobwa.csv")
        data = data[2:]
        data = [row for row in data if row[0] in language_lookup]

        # iterate over the source adding lexemes and collecting cognates
        for row in progressbar(data, desc="makecldf"):
            for cid in range(1, len(row), 2):
                # Skip over rows with empty fields for cogid
                if not row[cid + 1]:
                    continue

                # Compute a cognate_id number; lingpy now requires
                # this to be an integer
                cognate_id = cid * 100 + int(row[cid + 1])

                # Extract the value from the raw data, skipping over
                # missing or non-existing forms. We need to strip here
                # because some entries contain newlines, and FormSpec, as
                # the name implies, cleans forms rather than raw values.
                value = row[cid].strip()
                for lex in args.writer.add_lexemes(
                        Language_ID=language_lookup[row[0]],
                        Parameter_ID=concept_lookup[cid],
                        Value=value,
                        Cognacy=cognate_id,
                        Source=source_lookup[row[0]],
                ):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cognate_id,
                        Source="Lieberherr2017",
                    )