Example #1
# Imports assumed by this snippet (they are not shown in the original excerpt).
import tqdm
import iso639  # pip package "iso-639"
import lang2vec.lang2vec as l2v


def get_language_vectors_URIEL(features_set, language_families):
    """Returns the URIEL language vectors for a given feature set.

	Args:
		features_set (str): The name of the feature set.
		language_families (dict of str:str): Mappings from language names to their phylogenetic path.
	
	Returns:
		list of (list of float): The language-property matrix. Languages are rows, properties are columns.
		list of str: The list of languages (= row names).
		list of str: The list of properties (= column names).
	
	"""
    exceptions = {
        'aii': 'Assyrian',
        'arb': 'Arabic',
        'bho': 'Bhojpuri',
        'chu': 'Old Church Slavonic',
        'ell': 'Greek',
        'fro': 'Old French',
        'gla': 'Scottish Gaelic',
        'grc': 'Ancient Greek',
        'gsw': 'Swiss German',
        'gun': 'Mbya Guarani',
        'hsb': 'Upper Sorbian',
        'kmr': 'Kurmanji',
        'koi': 'Komi Permyak',
        'kpv': 'Komi Zyrian',
        'pcm': 'Naija',
        'pes': 'Persian',
        'sme': 'North Sami',
        'uig': 'Uyghur',
        'yue': 'Cantonese'
        # ??? : 'Buryat',
        # ??? : 'Classical Chinese',
        # ??? : 'Livvi',
        # ??? : 'Hindi English',
        # ??? : 'Old Russian',
        # ??? : 'Skolt Sami',
        # ??? : 'Swedish Sign Language',
    }
    vectors = {}
    for code in tqdm.tqdm(l2v.LANGUAGES):
        try:
            language = iso639.languages.get(part3=code).name
            language = language_families[language.replace(
                " ", "_")] + ", " + language
        except KeyError:
            if code in exceptions.keys():
                language = exceptions[code]
                language = language_families[language.replace(
                    " ", "_")] + ", " + language
            else:
                continue
        values = l2v.get_features(code, features_set, header=False)[code]
        vectors[language] = [(None if v == "--" else v) for v in values]
    properties = l2v.get_features("eng", features_set, header=True)["CODE"]
    languages = sorted(vectors.keys())
    matrix = [vectors[language] for language in languages]
    return matrix, languages, properties
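A sketch of the expected input shape for the function above; the mapping entries are made up, and note that the hard-coded exception languages also need entries, since the lookup inside the except branch is not guarded:

# Hypothetical mapping from underscored language names to phylogenetic paths.
language_families = {
    "English": "Indo-European, Germanic",
    "Finnish": "Uralic, Finnic",
}
# matrix, languages, properties = get_language_vectors_URIEL("syntax_knn", language_families)
# matrix[i][j] holds the value of properties[j] for languages[i]; "--" entries come back as None.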
Example #2
def load_features_l2v(features="learned",
                      comb_op="+",
                      source="",
                      verbose=False):
    '''Load lang2vec vectors for the given feature set, caching them on disk as a pickled dict.'''
    if features not in all_sets_names:
        print("Error: invalid feature set")
        return

    #Building the expected name
    if features in comb_sets:
        features = comb_op.join([s for s in databases_sets if features in s])

    print("Feature:", features)
    if source == "": source = "learned" if features == "learned" else "uriel"
    print("Source:", source)

    try:
        return np.load(PATH_FEATS + features + "<" + source + ".npy")
    except:

        list_lang = get_list_languages(source)
        dict_feats = {}
        for i, l in enumerate(list_lang):
            if i % 100 == 0: print(" lang.", i, "of", len(list_lang))
            dict_feats[l] = l2v.get_features(l, features)[l]
        np.save(PATH_FEATS + features + "<" + source + ".npy", dict_feats)
        return dict_feats
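The cache written above is a pickled dict saved with np.save. A minimal read-back sketch; the path is an assumption, and allow_pickle is required on recent NumPy:

import numpy as np

cached = np.load("feats/syntax_knn<uriel.npy", allow_pickle=True).item()  # dict: language code -> feature list
print(len(cached), "languages cached")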
Example #3
def add_syntax_feats(df):
    langs = set(df["Target Language Code"]).union(df["Source Language Code"])
    source_columns = ["syntax_src_" + str(i) for i in range(103)]
    target_columns = ["syntax_tgt_" + str(i) for i in range(103)]

    for column in source_columns + target_columns:
        df[column] = 0

    # collect syntax features for all langs
    d = {}
    for lang in langs:
        lang = lang_codes[lang]
        features = l2v.get_features(
            lang,
            l2v.fs_concatenation([
                l2v.fs_union(
                    ["syntax_wals", "syntax_sswl", "syntax_ethnologue"])
            ]))
        d[lang] = features[lang]

    # add all syntax features to dataframe
    for index in df.index:
        src = df.loc[index, "Source Language Code"]
        tgt = df.loc[index, "Target Language Code"]
        df.loc[index, source_columns] = d[lang_codes[src]]
        df.loc[index, target_columns] = d[lang_codes[tgt]]
        print(f"Collected syntactic features for {src} and {tgt}.")

    return df
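For reference, the combined feature set used above can be queried on its own. A minimal sketch; the language code is just an example:

import lang2vec.lang2vec as l2v

# Union of the three syntax sources, as in add_syntax_feats above.
fs = l2v.fs_concatenation(
    [l2v.fs_union(["syntax_wals", "syntax_sswl", "syntax_ethnologue"])])
vec = l2v.get_features("eng", fs)["eng"]
print(len(vec))  # should match the 103 syntax columns used above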
Example #4
def augment_langvec(task, column):
    feats = [
        "syntax_average", "phonology_average", "inventory_average", "geo",
        "fam"
    ]
    dims = [103, 28, 158, 299, 158]

    df = pd.read_csv(
        os.path.join(sys.argv[1], "data", "data_{}.csv".format(task)))

    for i, feat in enumerate(feats):
        for j in range(dims[i]):
            df[feat + "_{}".format(j)] = 0

    for index, lang in zip(df.index, df.loc[:, column]):
        features = l2v.get_features(lang, "+".join(feats))[lang]
        print("Loading langvec for {} ..".format(lang))
        for j, feat in enumerate(feats):
            start = sum(dims[:j])
            end = sum(dims[:j + 1])
            df.loc[index, [feat + "_" + str(i)
                           for i in range(dims[j])]] = features[start:end]
    df.to_csv(os.path.join(sys.argv[1], "data/data_lemma_new.csv"),
              index=False)
Example #5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--feature_name", required=True)
    parser.add_argument("--classify_method",
                        required=True,
                        choices=["svm", "logistic"])
    parser.add_argument("--lang_vec_path", required=True)
    parser.add_argument("--lang_name_path", required=True)
    parser.add_argument("--output_file_path", required=True)

    args, unknown = parser.parse_known_args()

    lang_name = read_data(args.lang_name_path)
    lang_vec = load_lang_vec(args.lang_vec_path)

    lang_alpha3 = {}

    for lang in lang_name:
        alpha3 = get_language_alpha3(lang[1:-1])
        if check_alpha3(alpha3):
            lang_alpha3[lang] = alpha3

    feature_name = args.feature_name
    features = l2v.get_features(list(lang_alpha3.values()),
                                feature_name,
                                header=True)

    train(args, lang_vec, lang_alpha3, features)
Example #6
def load_features(space_name, source_name):

    # source_name : ["Bib22", "Bib24", "Tan23", "Own23", "Own25", "Fact23", "Fact25"]
    label_total = None
    X_total = []
    #print(space_name, source_name)
    if space_name == "uriel":
        feat_set_uriel = source_name + "<" + space_name  # syntax_knn<uriel
        X_npdict = np.load(PATH_FEATS + feat_set_uriel + ".npy")
        label_total = list(get_list_languages(source="uriel"))
        #print(len(label_total))

    elif space_name == "learn":
        if source_name in ["Bib22", "Bib24", "Bible"]:
            feat_set_learn = "learned<learned"
            X_npdict = np.load(PATH_FEATS + feat_set_learn + ".npy")
            if source_name == "Bible":
                label_total = list(get_list_languages(source="learned"))
            elif source_name == "Bib22":
                label_total = wit3_bib22_ids
            elif source_name == "Bib24":
                label_total = wit3_bib24_ids
            #label_total[label_total.index("zho")] = "cmn" # unifying Chinese (zho, cmn)

        #elif source_name in ["Tan23", "Own23", "Fact23", "Own25", "Fact25", "Fact23-A", "Fact23-B"] or source_name.startswith("ted"):
        else:
            learn_emb_file = "data/embeddings/%s.npy" % source_name.split(
                "-")[0]
            X_npdict = np.load(learn_emb_file)
            label_total = sorted(list(X_npdict.item().keys()))

    for l in label_total:
        #    if source_name == "Fact23-A":
        #        X_total.append(X_npdict.item().get(l)[:256])
        #    elif source_name == "Fact23-B":
        #       X_total.append(X_npdict.item().get(l)[256:])
        #    else:
        X_total.append(X_npdict.item().get(l))

    # tag 2 iso codes
    #if source_name in ["Own23", "Fact23", "Own25", "Fact25", "Fact23-A", "Fact23-B"]:
    #    label_total = [wit3_map[l] for l in label_total]

    if "zho" in label_total:
        label_total[label_total.index("zho")] = "cmn"
    if "hbo" in label_total:
        label_total[label_total.index("hbo")] = "heb"

    # adding extra iso codes in URIEL (they are there!!!!!)
    if space_name == "uriel":
        list_extra_uriel = ["aze", "mon", "sqi"]
        import lang2vec.lang2vec as l2v
        X_extra_dict = l2v.get_features(list_extra_uriel, source_name)
        X_extra = []
        for l in list_extra_uriel:
            label_total.append(l)
            X_extra.append(X_extra_dict[l])
        X_total = np.concatenate((X_total, np.array(X_extra)), axis=0)

    return np.array(X_total), label_total
Example #7
    def _cache_language_features(self):
        """Pre-compute and cache lang2vec vectors for all in- and out-of-vocabulary languages."""
        features = dict()
        for lang in self.in_language_list + self.oov_language_list:
            features[lang] = l2v.get_features(
                l2v.LETTER_CODES[lang],
                self.config.language_features)[l2v.LETTER_CODES[lang]]
        self.l2v_cache = features
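A small sketch of the code lookup used above; "en" is an example key and the feature set name is an assumption:

import lang2vec.lang2vec as l2v

# LETTER_CODES maps short language codes to the ISO 639-3 codes lang2vec expects.
iso3 = l2v.LETTER_CODES["en"]
vec = l2v.get_features(iso3, "syntax_knn")[iso3]
print(iso3, len(vec))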
Example #8
def get_feature_dist_dicts():
    distance_dict = {}
    for feature_type in FEAT_LIST:
        distance_dict[feature_type] = l2v.get_features(lang_codes_3,
                                                       feature_type,
                                                       minimal=False)
        stacked_features = stack_features(distance_dict[feature_type])
        distance_dict[feature_type] = 1 - cosine_similarity(
            stacked_features, stacked_features)

        np.fill_diagonal(distance_dict[feature_type], 0)
    return distance_dict
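A self-contained sketch of the same computation; stack_features is defined elsewhere, so a hypothetical stand-in that fills missing "--" values with 0.0 is used, and the language codes are examples:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import lang2vec.lang2vec as l2v

def stack_features_sketch(feat_dict, langs):
    # Hypothetical stand-in for stack_features: one row per language, "--" treated as 0.0.
    return np.array([[0.0 if v == "--" else float(v) for v in feat_dict[l]] for l in langs])

langs = ["eng", "deu", "fra"]
feats = l2v.get_features(langs, "syntax_knn", minimal=False)
X = stack_features_sketch(feats, langs)
dist = 1 - cosine_similarity(X, X)
np.fill_diagonal(dist, 0)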
Example #9
def prepare_new_dataset(lang,
                        task="SA",
                        dataset_source=None,
                        dataset_target=None,
                        dataset_subword_source=None,
                        dataset_subword_target=None):
    features = {}
    features["lang"] = lang

    # Get dataset features
    if dataset_source is None and dataset_target is None and dataset_subword_source is None and dataset_subword_target is None:
        # print("NOTE: no dataset provided. You can still use the ranker using language typological features.")
        return features
    elif dataset_source is None:  # and dataset_target is None:
        # print("NOTE: no word-level dataset provided, will only extract subword-level features.")
        pass
    elif dataset_subword_source is None:  # and dataset_subword_target is None:
        # print("NOTE: no subword-level dataset provided, will only extract word-level features.")
        pass

    source_lines = []
    if dataset_source is None:
        pass  # only subword-level features will be extracted
    elif isinstance(dataset_source, str):
        with open(dataset_source) as inp:
            source_lines = inp.readlines()
    elif isinstance(dataset_source, list):
        source_lines = dataset_source
    else:
        raise Exception(
            "dataset_source should either be a filename (str) or a list of sentences."
        )
    if source_lines:
        features["dataset_size"] = len(source_lines)
        tokens = [w for s in source_lines for w in s.strip().split()]
        features["token_number"] = len(tokens)
        types = set(tokens)
        features["type_number"] = len(types)
        features["word_vocab"] = types
        features["type_token_ratio"] = features["type_number"] / float(
            features["token_number"])

    # read all pos features even if we might not use everything
    features["verb_ratio"] = pos_features(lang, 'verb')
    features["pron_ratio"] = pos_features(lang, 'pron')

    # Map the macrolanguage codes for Arabic and Persian to the specific codes lang2vec provides.
    code = {'ara': 'arb', 'fas': 'pes'}
    lang = code.get(lang, lang)
    if lang == 'eng':
        features["learned"] = np.zeros(512)  # 512
    else:
        features["learned"] = np.array(
            l2v.get_features(lang, 'learned')[lang])  # 512
    return features
Example #10
def augment_bli_features():
    langs = pd.read_csv("data/data_bli.csv")["Target Language Code"].unique()
    df = pd.DataFrame(columns=["syntax_" + str(i) for i in range(103)])
    for lang in langs:
        features = l2v.get_features(
            lang,
            l2v.fs_concatenation([
                l2v.fs_union(
                    ["syntax_wals", "syntax_sswl", "syntax_ethnologue"])
            ]))
        df.loc[lang] = features[lang]
        print(lang + " done!")
    df.to_csv("data/bli_tgt_feats.csv")
Example #11
def load_lang2vec_vectors(task="xnli", features=[]):
    "returns a matrix of lang vecs"
    vecs_matrix = []
    langs = get_langs_for_task(task)
    try:
        lang2vec = l2v.get_features(langs, features, minimal=True)
        for lang in langs:
            vecs_matrix.append(lang2vec[lang])
        null_columns = detect_null_columns(vecs_matrix)
        if len(null_columns) == len(vecs_matrix[0]):
            return None
        elif len(null_columns) > 0:
            vecs_matrix = np.delete(vecs_matrix, null_columns, axis=1)
        assert len(vecs_matrix) == len(langs)
        return np.array(vecs_matrix)
    except Exception as e:
        return None
Example #12
def get_feature(src_lang="ug", lang=["eng"], prop="syntax_wals", printout=False, missing_accounting=False):
    readable = OrderedDict()
    if lang=="all":
        lang = list(l2v.available_languages())
    #elif src_lang is not None:
    lang.append(src_lang)

    dic = l2v.get_features(lang, prop,header=True)
    code = dic["CODE"]

    for key in dic:
        if key != "CODE":
            readable[key] = [(_code, val) for _code, val in zip(code,dic[key]) if val == 1]
            dic[key] = np.array(dic[key])
    if printout:
        print(readable)
    
    if src_lang is not None:
        intersect_with_src_rate= OrderedDict()
        intersect_with_src_ls = OrderedDict()
        disjoin_in_target_rate = OrderedDict()
        disjoin_with_lang = OrderedDict()
        disjoin_in_target = OrderedDict()
        intersect_over_union = OrderedDict()
        for _lang in lang:

            if missing_accounting:
                intersect_with_src_rate[_lang+"-"+src_lang] = np.round(np.sum(dic[src_lang][dic[src_lang]==dic[_lang]]=="1.0")/len(dic[src_lang]),decimals=3)
                intersect_with_src_ls[_lang+"-"+src_lang] = [(_code, val) for _code, val, val_2 in zip(code,dic[src_lang], dic[_lang]) if val == 1 and val==val_2]
            else:
                intersect_over_union[_lang+"-"+src_lang] = np.round(np.sum(dic[src_lang][dic[src_lang]==dic[_lang]]=="1.0")/len(dic[src_lang]),decimals=3)
                intersect_with_src_rate[_lang+"-"+src_lang] = np.round(np.sum(dic[src_lang][dic[src_lang]==dic[_lang]]=="1.0")/np.sum(dic[src_lang]=="1.0"),decimals=3)
                intersect_with_src_ls[_lang+"-"+src_lang] = [_code for _code, val, val_2 in zip(code,dic[src_lang], dic[_lang]) if val_2==val and val=="1.0"]
                # hard disjoint : if target is 1 and src is UNK --> considered disjoint
                disjoin_in_target[_lang+"-"+src_lang] = [_code for _code, val, val_2 in zip(code,dic[src_lang], dic[_lang]) if val_2!=val and val_2=="1.0"]
                disjoin_in_target_rate[_lang+"-"+src_lang]  = np.round(np.sum(dic[_lang][dic[src_lang]!=dic[_lang]]=="1.0")/np.sum(dic[_lang]=="1.0"),decimals=3)
                #pdb.set_trace()
                #np.round(np.sum(dic[src_lang][dic[src_lang]!=dic[_lang]]=="1.0")/np.sum(dic[src_lang]=="1.0"),decimals=3)
                # hard disjoint : if src is 1 and target is UNK --> considered disjoint
                disjoin_with_lang[_lang+"-"+src_lang] = [_code for _code, val, val_2 in zip(code,dic[src_lang], dic[_lang]) if val_2!=val and val=="1.0"]
                #npreadable[lang]
    return readable, dic, intersect_with_src_rate, intersect_with_src_ls, disjoin_in_target, disjoin_in_target_rate, disjoin_with_lang, intersect_over_union
Example #13
# The argument parser construction is not shown in this excerpt; a minimal reconstruction
# based on the arguments used below (args.langs, args.output):
import argparse

import lang2vec.lang2vec as l2v

parser = argparse.ArgumentParser()
parser.add_argument("--langs", help="space-separated language codes")
parser.add_argument("--output", help="file to write the selected feature vectors to")
args = parser.parse_args()

#['S_SVO', 'S_SOV', 'S_VSO', 'S_VOS', 'S_OVS', 'S_OSV', 'S_SUBJECT_BEFORE_VERB', 'S_SUBJECT_AFTER_VERB',
# 'S_OBJECT_AFTER_VERB', 'S_OBJECT_BEFORE_VERB', 'S_SUBJECT_BEFORE_OBJECT', 'S_SUBJECT_AFTER_OBJECT',
# 'S_GENDER_MARK', 'S_SEX_MARK', 'S_DEFINITE_AFFIX', 'S_DEFINITE_WORD', 'S_INDEFINITE_AFFIX', 'S_INDEFINITE_WORD',
# 'S_POSSESSIVE_PREFIX', 'S_POSSESSIVE_SUFFIX', 'S_ADPOSITION_BEFORE_NOUN', 'S_ADPOSITION_AFTER_NOUN',
# 'S_POSSESSOR_BEFORE_NOUN', 'S_POSSESSOR_AFTER_NOUN', 'S_ADJECTIVE_BEFORE_NOUN', 'S_ADJECTIVE_AFTER_NOUN',
# 'S_DEMONSTRATIVE_WORD_BEFORE_NOUN', 'S_DEMONSTRATIVE_WORD_AFTER_NOUN', 'S_DEMONSTRATIVE_PREFIX',
# 'S_DEMONSTRATIVE_SUFFIX', 'S_NUMERAL_BEFORE_NOUN', 'S_NUMERAL_AFTER_NOUN', 'S_RELATIVE_BEFORE_NOUN',
# 'S_RELATIVE_AFTER_NOUN', 'S_RELATIVE_AROUND_NOUN', 'S_NOMINATIVE_VS_ACCUSATIVE_MARK',
# 'S_ERGATIVE_VS_ABSOLUTIVE_MARK', 'S_NEGATIVE_WORD_BEFORE_VERB', 'S_NEGATIVE_PREFIX',
# 'S_NEGATIVE_WORD_AFTER_VERB', 'S_NEGATIVE_SUFFIX', 'S_NEGATIVE_WORD_BEFORE_SUBJECT',
# 'S_NEGATIVE_WORD_AFTER_SUBJECT', 'S_NEGATIVE_WORD_BEFORE_OBJECT', 'S_NEGATIVE_WORD_AFTER_OBJECT',
# 'S_NEGATIVE_WORD_INITIAL', 'S_NEGATIVE_WORD_FINAL', 'S_NEGATIVE_WORD_ADJACENT_BEFORE_VERB',
# 'S_NEGATIVE_WORD_ADJACENT_AFTER_VERB', 'S_PLURAL_PREFIX', 'S_PLURAL_SUFFIX', 'S_PLURAL_WORD',
# 'S_OBJECT_HEADMARK', 'S_OBJECT_DEPMARK', 'S_POSSESSIVE_HEADMARK', 'S_POSSESSIVE_DEPMARK', 'S_TEND_HEADMARK',
# 'S_TEND_DEPMARK', 'S_TEND_PREFIX', 'S_TEND_SUFFIX', 'S_ANY_REDUP', 'S_CASE_PREFIX', 'S_CASE_SUFFIX',
# 'S_CASE_PROCLITIC', 'S_CASE_ENCLITIC', 'S_CASE_MARK', 'S_COMITATIVE_VS_INSTRUMENTAL_MARK', 'S_NUMCLASS_MARK',
# 'S_ADJECTIVE_WITHOUT_NOUN', 'S_PERFECTIVE_VS_IMPERFECTIVE_MARK', 'S_PAST_VS_PRESENT_MARK', 'S_FUTURE_AFFIX',
# 'S_TAM_PREFIX', 'S_TAM_SUFFIX', 'S_DEGREE_WORD_BEFORE_ADJECTIVE', 'S_DEGREE_WORD_AFTER_ADJECTIVE', 'S_POLARQ_MARK_INITIAL', 'S_POLARQ_MARK_FINAL', 'S_POLARQ_MARK_SECOND', 'S_POLARQ_WORD', 'S_POLARQ_AFFIX', 'S_SUBORDINATOR_WORD_BEFORE_CLAUSE', 'S_SUBORDINATOR_WORD_AFTER_CLAUSE', 'S_SUBORDINATOR_SUFFIX', 'S_PROSUBJECT_WORD', 'S_PROSUBJECT_AFFIX', 'S_PROSUBJECT_CLITIC', 'S_NEGATIVE_AFFIX', 'S_NEGATIVE_WORD', 'S_ANY_AGREEMENT_ON_ADJECTIVES', 'S_COMPLEMENTIZER_WORD_BEFORE_CLAUSE', 'S_COMPLEMENTIZER_WORD_AFTER_CLAUSE', 'S_VOX', 'S_XVO', 'S_XOV', 'S_OXV', 'S_OVX', 'S_OBLIQUE_AFTER_VERB', 'S_OBLIQUE_AFTER_OBJECT', 'S_OBLIQUE_BEFORE_VERB', 'S_OBLIQUE_BEFORE_OBJECT', 'S_ARTICLE_WORD_BEFORE_NOUN', 'S_ARTICLE_WORD_AFTER_NOUN']

fout = open(args.output, "w")
lang_list = args.langs.split()
features = l2v.get_features(lang_list, "syntax_wals")
for lang, feats in features.items():
    # Keep the word-order features (indices 0-11) and the adposition/possessor/adjective
    # order features (indices 20-25), following the feature list above.
    selected_feats = feats[:12] + feats[20:26]
    # Missing values come back as the string "--"; encode them as "2.0".
    features[lang] = [
        "2.0" if isinstance(f, str) else str(f) for f in selected_feats
    ]
    fout.write(lang + "\t" + " ".join(features[lang]) + "\n")
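The names of the selected columns can be recovered from the feature header. A minimal sketch; the language code is just an example:

import lang2vec.lang2vec as l2v

# header=True adds a "CODE" entry listing the feature names in order.
names = l2v.get_features("eng", "syntax_wals", header=True)["CODE"]
print(names[:12] + names[20:26])  # the word-order and adposition/possessor/adjective features kept above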
Example #14
import lang2vec.lang2vec as l2v

FEATURES = 'syntax_average+phonology_average+inventory_average'
LANGUAGES = 'sv zh ja it ko eu en fi tr hi ru ar he akk aii bho krl koi kpv olo mr mdf sa tl wbp yo gsw am bm be br bxr yue myv fo kk kmr gun pcm ta te hsb cy'

features_avg = l2v.get_features(LANGUAGES, FEATURES)

missing_features = open('missing_features.txt', 'w', encoding='utf-8')
for lang, feat in features_avg.items():
    nof = feat.count('--')
    missing_features.write('{} : {}\n'.format(lang, nof))

missing_features.close()
Example #15
def get_kNN_langs(k=10,
                  geodesic=True,
                  genetic=True,
                  source="uriel",
                  geo_or_gen=False,
                  save_all=True):
    if geodesic and not genetic:
        try:
            dict_geodesic = np.load(PATH_FEATS + str(k) + "_geodesic_" +
                                    source + ".npy")
            return dict_geodesic
        except:
            print("File to be processed...")
    if genetic and not geodesic:
        try:
            dict_genetic = np.load(PATH_FEATS + str(k) + "_genetic_" + source +
                                   ".npy")
            return dict_genetic
        except:
            print("File to be processed...")
    if genetic and geodesic:
        try:
            dict_geo_gen = np.load(PATH_FEATS + str(k) + "_geo&gen_" + source +
                                   ".npy")
            return dict_geo_gen
        except:
            print("File to be processed...")

    list_lang = get_list_languages(source=source)
    #test purposes
    #list_lang = list_lang[:10]

    geodesic_feats = {}
    genetic_feats = {}

    geodesic_dist = {}
    genetic_dist = {}
    geo_gen_dist = {}

    num_lang = len(list_lang)
    print("Getting features for languages....")
    i = 0
    for l in list_lang:
        if geodesic:
            # Cast to numpy arrays so the vector arithmetic below works.
            geodesic_feats[l] = np.array(l2v.get_features(l, "geo").get(l))
            if not genetic: geodesic_dist[l] = {}
        if genetic:
            genetic_feats[l] = np.array(l2v.get_features(l, "fam").get(l))
            if not geodesic: genetic_dist[l] = {}
        if geodesic and genetic:
            geo_gen_dist[l] = {}
        i += 1
        if (i % 100 == 0): print("  %04d/%d" % (i, num_lang))

    if save_all:
        if (geodesic and not genetic) or geo_or_gen:
            np.save(PATH_FEATS + "feats_geodesic_" + source + ".npy",
                    geodesic_feats)
        if (genetic and not geodesic) or geo_or_gen:
            np.save(PATH_FEATS + "feats_genetic_" + source + ".npy",
                    genetic_feats)

    print("Computing distances....")
    i = 0
    for l1 in list_lang:
        if geodesic:
            l1_geo = geodesic_feats.get(l1)
        if genetic:
            l1_gen = genetic_feats.get(l1)
        for l2 in list_lang:
            if l2 != l1:
                if (geodesic and not genetic) or geo_or_gen:
                    l2_in_dist1 = l2 in geodesic_dist.get(l1)
                    l1_in_dist2 = l1 in geodesic_dist.get(l2)
                    if not l2_in_dist1 and not l1_in_dist2:
                        dist = np.linalg.norm(l1_geo - geodesic_feats.get(l2))
                        geodesic_dist[l1][l2] = dist
                        geodesic_dist[l2][l1] = dist
                    elif not l2_in_dist1:
                        geodesic_dist[l1][l2] = geodesic_dist.get(l2).get(l1)
                    elif not l1_in_dist2:
                        geodesic_dist[l2][l1] = geodesic_dist.get(l1).get(l2)

                if (genetic and not geodesic) or geo_or_gen:
                    l2_in_dist1 = l2 in genetic_dist.get(l1)
                    l1_in_dist2 = l1 in genetic_dist.get(l2)
                    if not l2_in_dist1 and not l1_in_dist2:
                        dist = np.linalg.norm(l1_gen - genetic_feats.get(l2))
                        genetic_dist[l1][l2] = dist
                        genetic_dist[l2][l1] = dist
                    elif not l2_in_dist1:
                        genetic_dist[l1][l2] = genetic_dist.get(l2).get(l1)
                    elif not l1_in_dist2:
                        genetic_dist[l2][l1] = genetic_dist.get(l1).get(l2)

                if geodesic and genetic:
                    l2_in_dist1 = l2 in geo_gen_dist.get(l1)
                    l1_in_dist2 = l1 in geo_gen_dist.get(l2)
                    if not l2_in_dist1 and not l1_in_dist2:
                        dist = np.linalg.norm(
                            np.concatenate((l1_geo, l1_gen)) -
                            np.concatenate((geodesic_feats.get(l2),
                                            genetic_feats.get(l2))))
                        geo_gen_dist[l1][l2] = dist
                        geo_gen_dist[l2][l1] = dist
                    elif not l2_in_dist1:
                        geo_gen_dist[l1][l2] = geo_gen_dist.get(l2).get(l1)
                    elif not l1_in_dist2:
                        geo_gen_dist[l2][l1] = geo_gen_dist.get(l1).get(l2)
        i += 1
        if (i % 100 == 0): print("  %04d/%d" % (i, num_lang))

    if save_all:
        if (geodesic and not genetic) or geo_or_gen:
            np.save(PATH_FEATS + "dist_geodesic_" + source + ".npy",
                    geodesic_dist)
        if (genetic and not geodesic) or geo_or_gen:
            np.save(PATH_FEATS + "dist_genetic_" + source + ".npy",
                    genetic_dist)
        if genetic and geodesic:
            np.save(PATH_FEATS + "dist_geo&gen_" + source + ".npy",
                    geo_gen_dist)

    geodesic_knn = {}
    genetic_knn = {}
    geo_gen_knn = {}

    print("Ordering by distance....")
    i = 0
    for l1 in list_lang:
        if (geodesic and not genetic) or geo_or_gen:
            d = geodesic_dist[l1]
            sorted_dist = [(ks, d[ks]) for ks in sorted(d, key=d.__getitem__)]
            geodesic_knn[l1] = sorted_dist[:k]  # keep the k nearest neighbours
        if (genetic and not geodesic) or geo_or_gen:
            d = genetic_dist[l1]
            sorted_dist = [(ks, d[ks]) for ks in sorted(d, key=d.__getitem__)]
            genetic_knn[l1] = sorted_dist[:k]
        if genetic and geodesic:
            d = geo_gen_dist[l1]
            sorted_dist = [(ks, d[ks]) for ks in sorted(d, key=d.__getitem__)]
            geo_gen_knn[l1] = sorted_dist[:k]
        i += 1
        if (i % 100 == 0): print("  %04d/%d" % (i, num_lang))

    if (geodesic and not genetic) or geo_or_gen:
        np.save(PATH_FEATS + str(k) + "_geodesic_" + source + ".npy",
                geodesic_knn)
        return np.load(PATH_FEATS + str(k) + "_geodesic_" + source + ".npy")
    if (genetic and not geodesic) or geo_or_gen:
        np.save(PATH_FEATS + str(k) + "_genetic_" + source + ".npy",
                genetic_knn)
        return np.load(PATH_FEATS + str(k) + "_genetic_" + source + ".npy")
    np.save(PATH_FEATS + str(k) + "_geo&gen_" + source + ".npy", geo_gen_knn)
    return np.load(PATH_FEATS + str(k) + "_geo&gen_" + source + ".npy")
Example #16
    key = "conll_" + language
    features[key] = {}
    features[key]["lang"] = language
    features[key]["dataset_size"] = lines

    unique = list(c)
    # Get number of types and tokens
    features[key]["token_number"] = len(all_words)
    features[key]["type_number"] = len(unique)
    features[key]["word_vocab"] = unique
    features[key]["type_token_ratio"] = features[key]["type_number"] / float(
        features[key]["token_number"])

    # features[key]["noun_ratio"] = pos_features(language, 'noun')
    features[key]["verb_ratio"] = pos_features(language, 'verb')
    features[key]["pron_ratio"] = pos_features(language, 'pron')
    # features[key]["n2v_ratio"] = pos_features(language, 'noun2verb')
    # features[key]["p2n_ratio"] = pos_features(language, 'pron2noun')

    code = {'ara': 'arb', 'fas': 'pes'}
    language = code.get(language, language)
    if language == 'eng':
        features[key]["learned"] = np.zeros(512)  # 512
    else:
        features[key]["learned"] = np.array(
            l2v.get_features(language, 'learned')[language])

indexed = "indexed/DEP"
outputfile = os.path.join(indexed, "conll.npy")
np.save(outputfile, features)
Example #17
def load_families(list_langs):
    import lang2vec.lang2vec as l2v
    # extracting the one-hot-vectors of families
    X_families = l2v.get_features(list_langs, "fam", header=True)
    # family names
    list_fam_names = X_families['CODE']
    # one-hot-codes
    X_fam_data = []
    for lang in list_langs:
        X_fam_data.append(X_families[lang])
    X_fam_data = np.array(X_fam_data)
    # extracting the families with at least one entry
    X_dict_fam = {}
    for idx_fam, fam in enumerate(X_fam_data.T):
        # if it has at least one entry with 1 ...
        if max(fam) == 1:
            fam_name = list_fam_names[idx_fam]
            X_dict_fam[fam_name] = []
            # identify the languages that belong to that family group
            for idx_lang, lang_value in enumerate(fam):
                if lang_value == 1:
                    X_dict_fam[fam_name].append(list_langs[idx_lang])

    # reducing the "duplicated" family groups: with the same members
    family_values = [" ".join(sorted(group)) for group in X_dict_fam.values()]
    # we sort the groups by LEN for further processing
    lang_groups = sorted(list(set(family_values)), key=len)
    # converting back to a list, for easier processing
    lang_groups_lists = [group.split() for group in lang_groups]
    # if the largest (last) group covers more than half of the languages, drop it (typically Indo-European)
    if len(lang_groups_lists[-1]) > len(list_langs) / 2:
        lang_groups_lists = lang_groups_lists[:-1]
    '''
    reverse_idx = -1
    while len(lang_groups_lists[reverse_idx]) > 50:
        reverse_idx -= 1
    reverse_idx += 1
    if reverse_idx < 0:
        lang_groups_lists = lang_groups_lists[:reverse_idx]
    '''
    # obtaining the final list of family groups that are not included in bigger groups
    final_list = []
    for idx, group in enumerate(lang_groups_lists):
        # flag marking whether this group is contained in a later (larger) group; the prior sorting by length makes this check sufficient
        flag_included = False
        for other_group in lang_groups_lists[idx + 1:]:
            # if other_group contains every element of group
            if all(elem in other_group for elem in group):
                flag_included = True
                break
        # if the group is not included, we should keep it
        if not flag_included:
            final_list.append(group)
    # obtaining the final names
    final_dict = {}
    for g in final_list:
        # we assume that the family names are sorted by priority/inclusivity (e.g. Italic before Romance)
        for key, val in X_dict_fam.items():
            # we compare at string level...
            if " ".join(g) == " ".join(sorted(val)):
                final_dict[key] = g
                break
    #for key, val in final_dict.items():
    #    print(key, len(val))

    return final_dict
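A minimal usage sketch for the function above; the language list is an example, and numpy is assumed to be imported as np, as in the function body:

families = load_families(["eng", "deu", "nld", "spa", "fra", "fin", "hun"])
for name, members in sorted(families.items()):
    print(name, members)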