def __init__(self, language, ablate=None):
    """Set up the language-specific resources used by this object.

    Args:
        language (str): language of the input data. Unigram
            probabilities are read for 'english' and 'spanish'; a spaCy
            pipeline is loaded for 'english', 'spanish', 'german' and
            'french'.
        ablate: stored unchanged on ``self.ablate``; it is not
            interpreted by the constructor.

    Attributes set:
        language: the ``language`` argument.
        u_prob: result of ``file_io.read_file`` on the language's unigram
            probability CSV, or ``None`` when no file is configured.
        nlp: result of ``spacy.load`` for the language's model, or
            ``None`` when no model is configured.
        ablate: the ``ablate`` argument.
    """
    # Unigram-probability files, expected under data/external. Only
    # English and Spanish are wired up; the German loader was commented
    # out in the original code.
    unigram_files = {
        'english': 'data/external/english_u_prob.csv',
        'spanish': 'data/external/spanish_u_prob.csv',
    }
    # spaCy model used for tokenisation, per language.
    spacy_model_names = {
        'english': 'en_core_web_lg',
        'spanish': 'es_core_news_md',
        'german': 'de_core_news_sm',
        'french': 'fr_core_news_md',
    }

    self.language = language

    # Always define both attributes so that a language without a
    # configured resource yields None instead of a missing attribute.
    self.u_prob = None
    self.nlp = None

    if language in unigram_files:
        self.u_prob = file_io.read_file(unigram_files[language])

    if language in spacy_model_names:
        self.nlp = spacy.load(spacy_model_names[language])

    self.ablate = ablate

    # TODO: load pyphen stuff here.
    # NOTE: building the Spanish frequency index is disabled in this
    # version of the constructor (the code was quoted out).
def __init__(self, language):
    """Set up the language-specific resources used by this object.

    Args:
        language (str): language of the input data. Unigram
            probabilities are read for 'english' and 'spanish'; a spaCy
            pipeline is loaded for 'english', 'spanish', 'german' and
            'french'; the frequency index is built for 'spanish' only.

    Attributes set:
        language: the ``language`` argument.
        u_prob: result of ``file_io.read_file`` on the language's unigram
            probability CSV, or ``None`` when no file is configured.
        nlp: result of ``spacy.load`` for the language's model, or
            ``None`` when no model is configured.
        esp_freq_index: only set when ``language == 'spanish'``; maps the
            first comma-separated field of each line of the frequency
            file to the second field converted to ``int``.
    """
    # Unigram-probability files, expected under data/external.
    unigram_files = {
        'english': 'data/external/english_u_prob.csv',
        'spanish': 'data/external/spanish_u_prob.csv',
    }
    # spaCy model used for tokenisation, per language.
    spacy_model_names = {
        'english': 'en_core_web_lg',
        'spanish': 'es_core_news_md',
        'german': 'de_core_news_sm',
        'french': 'fr_core_news_md',
    }

    self.language = language

    # Always define both attributes so that a language without a
    # configured resource yields None instead of a missing attribute.
    self.u_prob = None
    self.nlp = None

    if language in unigram_files:
        if language == 'spanish':
            print('reading unigram probs')
        self.u_prob = file_io.read_file(unigram_files[language])

    if language in spacy_model_names:
        self.nlp = spacy.load(spacy_model_names[language])

    # TODO: load pyphen stuff here.

    # Build a frequency-index reference for the Spanish language.
    if language == 'spanish':
        self.esp_freq_index = {}
        with open("data/external/spanish_subtitle_words_frequency_indexes.txt",
                  "r", encoding="utf-8") as f:
            # Stream the file instead of materialising it with readlines().
            for line in f:
                # A blank line (e.g. a trailing newline at end of file)
                # has no second field and would raise IndexError.
                if not line.strip():
                    continue
                fields = line.split(",")
                self.esp_freq_index[fields[0]] = int(fields[1])
def __init__(self, language=None, ablate=None, features_to_use=None):
    """Select the features to compute and load only the resources they need.

    Args:
        language (str): language of the input data. When it is one of
            'english', 'spanish', 'german' or 'french', only that
            language's resources are loaded; for any other value
            (including the default ``None``) resources for all four
            languages are loaded.
        ablate: stored unchanged on ``self.ablate``; it is not
            interpreted by the constructor.
        features_to_use: a list of string-named features to use.
            ``None`` or the string ``'all'`` selects every known feature.
            Unknown names are reported with ``print`` and dropped.

    Attributes set:
        features_to_use: the recognised feature names, in input order.
        total_requirements: set of resource tags ('spacy', 'hyph',
            'affix', 'unigram_probs') needed by the selected features.
        affixes: result of ``affix_features.get_affixes()`` when 'affix'
            is required, otherwise an empty dict.
        spacy_models, hyph_dictionaries, unigram_prob_dict: dicts keyed
            by the four language names. An entry holds the loaded
            resource, or ``None`` when it was not required or belongs to
            a language that was not selected.
        ablate: the ``ablate`` argument.
    """
    # This dict contains all available features, along with a list of
    # their high-computing-power requirements, e.g. ['spacy'].
    feature_requirements = {
        'is_nounphrase': ['spacy'],
        'len_tokens_norm': ['spacy'],
        'hypernym_count': None,
        'len_chars_norm': None,
        'len_tokens': None,
        'len_syllables': ['hyph'],
        'consonant_freq': None,
        'gr_or_lat': ['affix'],
        'is_capitalised': None,
        'num_complex_punct': None,
        'avg_chars_p_word': None,
        'sent_length': None,
        'unigram_prob': ['unigram_probs'],
        'char_n_gram_feats': None,
        'sent_n_gram_feats': None,
        'iob_tags': ['spacy'],
        'lemma_feats': ['spacy'],
        'bag_of_shapes': ['spacy'],
        'pos_tag_counts': ['spacy'],
        'NER_tag_counts': ['spacy'],
    }

    if features_to_use is None or features_to_use == 'all':
        features_to_use = list(feature_requirements)

    # Total requirements is the set of all requirements of the
    # recognised features.
    self.total_requirements = set()
    final_features = []
    for feature in features_to_use:
        # Making sure that we know about the feature.
        if feature not in feature_requirements:
            print(
                "{} did not match any of the features in feature_requirements, so was not used."
                .format(feature))
            continue
        self.total_requirements.update(feature_requirements[feature] or ())
        final_features.append(feature)
    self.features_to_use = final_features

    languages = ('english', 'spanish', 'german', 'french')
    spacy_model_names = {
        'english': 'en_core_web_lg',
        'spanish': 'es_core_news_md',
        'german': 'de_core_news_sm',
        'french': 'fr_core_news_md',
    }
    hyph_codes = {
        'english': 'en',
        'spanish': 'es',
        'german': 'de',
        'french': 'fr',
    }

    self.affixes = {}
    # Every resource dict keeps all four language keys; entries that are
    # never loaded stay None, so lookups for an unloaded language return
    # None instead of raising KeyError.
    self.spacy_models = dict.fromkeys(languages)
    self.hyph_dictionaries = dict.fromkeys(languages)
    self.unigram_prob_dict = dict.fromkeys(languages)

    # So that we're only opening this file once.
    if 'affix' in self.total_requirements:
        self.affixes = affix_features.get_affixes()

    # A recognised language loads only its own resources; anything else
    # falls back to loading all four languages.
    to_load = (language,) if language in languages else languages

    if 'spacy' in self.total_requirements:
        for lang in to_load:
            self.spacy_models[lang] = spacy.load(spacy_model_names[lang])

    if 'hyph' in self.total_requirements:
        for lang in to_load:
            self.hyph_dictionaries[lang] = pyphen.Pyphen(lang=hyph_codes[lang])

    if 'unigram_probs' in self.total_requirements:
        for lang in to_load:
            self.unigram_prob_dict[lang] = file_io.read_file(
                'data/external/{}_u_prob.csv'.format(lang))

    self.ablate = ablate