def get_avg_vector(word, embedding_words):
    """Return a Word whose vector is the mean of the per-token vectors.

    A multi-token *word* (contains spaces) is split on spaces; each token
    is looked up with find_word as-is and, failing that, lower-cased.
    Tokens with no embedding are skipped.  A single-token word is simply
    delegated to find_word (whose result may be falsy when unknown).

    Returns None when a multi-token word has no token with an embedding
    (the original code would have averaged an empty list and crashed).
    """
    if " " not in word:
        # Single token: direct embedding lookup.
        return find_word(word, embedding_words)

    list_vector = []
    for single_word in word.split(" "):
        word_vec = find_word(single_word, embedding_words)
        if not word_vec:
            # Retry with a lower-cased token before giving up.
            word_vec = find_word(single_word.lower(), embedding_words)
        if word_vec:
            list_vector.append(word_vec.vector)

    if not list_vector:
        # No token had an embedding; mirror find_word's "not found" result.
        return None
    # Frequency 1, matching parse-time convention for synthesized words.
    return Word(word, vectors.mean_list(list_vector), 1)
def get_random_wordpairs(question_count=10):
    """Return up to *question_count* Word pairs read from the database.

    A pseudo-random UUID derived from the current time plus a random
    offset is used as a lower bound on the row id, so successive calls
    tend to start at an unpredictable point in tbl_deutsch.  The query
    requires ALLOW FILTERING because ``id > %s`` is a non-key filter and
    Cassandra would otherwise refuse it for unpredictable performance.

    Returns an empty list if the read times out.
    """
    # Create a session with defaults.
    session = get_session_with_defaults()
    wordpair_list = []

    # Seed a name-based UUID with "now + random jitter" to randomize the
    # starting row of the scan.
    ticks = time.time()
    rand_seed = float(random.randrange(0, 10000) + ticks)
    random_uniqueId = uuid.uuid5(uuid.NAMESPACE_DNS, str(rand_seed))

    statement_select_words = (
        "SELECT german_word,english_word FROM tbl_deutsch "
        "WHERE id>%s LIMIT %s ALLOW FILTERING"
    )
    future = session.execute_async(
        statement_select_words, [random_uniqueId, question_count])
    try:
        for row in future.result():
            wordpair_list.append(Word(row.german_word, row.english_word))
    except ReadTimeout:
        # Best-effort: log and fall through to return what we have (nothing).
        log.exception("Query timed out:")
    return wordpair_list
def __init__(self, words: List[Word], attribute_to_values_dict: AttributeToValuesDict):
    """
    A standard reader.  This class should not be instantiated directly;
    use UDTreebankReader instead.

    Takes a word list and a dict mapping each tracked attribute to the
    values it may take; attribute-value pairs absent from that dict are
    discarded from every word before storage.
    """
    self._unimorph_attributes_to_values_dict: AttributeToValuesDict = attribute_to_values_dict

    tracked = self._unimorph_attributes_to_values_dict
    filtered_words: List[Word] = []
    for original in words:
        # Keep only attribute-value pairs this reader tracks.
        kept: Dict[str, str] = {}
        for attr in original.get_attributes():
            if attr not in tracked:
                continue  # untracked attribute
            val = original.get_attribute(attr)
            if val not in tracked[attr]:
                continue  # untracked value
            kept[attr] = val
        filtered_words.append(
            Word(original.get_word(), original.get_embedding(),
                 original.get_count(), kept))

    self._words: List[Word] = filtered_words
    self._cache: Dict[str, Dict[str, Any]] = {}
def get_random_wordpairs(database, table, question_count):
    '''Return a list of random Word objects read from *table*.'''
    rows_tuple = select_random_questions(database, table, question_count)
    # Each row is (german_word, english_word, ...); build one Word per row.
    return [Word(row[0], row[1]) for row in rows_tuple]
def parse_line(line: str, frequency: int) -> Word:
    """Parse one embedding-file line ("token v1 v2 ...") into a Word.

    The vector is passed through v.normalize when
    emb_config.do_normalize_emb is set; otherwise it is used raw.
    (The original duplicated the array construction in both branches.)
    """
    tokens = line.split(" ")
    word = tokens[0]
    vector = np.array([float(x) for x in tokens[1:]])
    if emb_config.do_normalize_emb:
        # Normalization policy is configured globally.
        vector = v.normalize(vector)
    return Word(word, vector, frequency)
def get_selected_wordpairs(database, table, question_count, sql_select_statement):
    '''Return Word objects for the rows matched by *sql_select_statement*.'''
    # e.g. "SELECT german_word,english_word FROM verb where german_word LIKE 'ver%'"
    rows_tuple = execute_generic_query(database, table, sql_select_statement)
    return [Word(row[0], row[1]) for row in rows_tuple]
def pad_sentences(wordlist, padding=20):
    """Left-pad every sentence to exactly *padding* tokens.

    Sentences longer than *padding* are truncated to their first
    *padding* words; shorter ones are prefixed with pad tokens.  All
    padded sentences share the same pad-token instance.
    """
    pad = Word()
    pad.set_as_pad_token()
    padded_wordlist = []
    for sentence in wordlist:
        if len(sentence) > padding:
            fixed = sentence[:padding]
        else:
            # Prefix with pads so the sentence is right-aligned.
            fixed = [pad] * (padding - len(sentence)) + list(sentence)
        padded_wordlist.append(fixed)
    return padded_wordlist
def __init__(self, verse, num_verse):
    """Create an objet type Verse."""
    # NOTE(review): this source was recovered from a collapsed line; the
    # indentation below (in particular whether str_cesure is assigned
    # inside the METRICS guard) is reconstructed — confirm against VCS.
    super().__init__(verse)
    self.num_verse = num_verse
    # Tokenize, then apply post-tonic 'e' adjustments BEFORE wrapping the
    # tokens in Word objects — the call order here is significant.
    self.words = self.splitwords()
    self.__e_posttonique()
    # self.__ents_posttonique('es', 'ës')
    # self.__ents_posttonique('ent', 'ënt')
    self.words = [Word(w) for w in self.words]
    self.__e_last()
    self.update()
    self.cesure = set()
    if gv.METRICS > 1:
        # Extra metrical analysis only at higher METRICS levels.
        self.__set_dict_syll()
        self.__find_cesure()
    self.str_cesure = '/'.join(sorted(self.cesure))
def read(cls, paths: List[str]) -> List[Word]:
    """
    Should be overriden with the logic to (i) read all words in the
    dataset and (ii) discover the values each unimorph attribute can
    take and place them in self._unimorph_attributes_to_values_dict.
    """
    # Concatenate the pickled word records from every input file.
    # NOTE: pickle.load must only be used on trusted, project-produced files.
    raw_words: List[Dict[str, Any]] = []
    for path in paths:
        with open(path, "rb") as h:
            raw_words.extend(pickle.load(h))

    # One Word per record, with the embedding flattened to 1-D and a
    # fixed count of 1.
    return [
        Word(item["word"], item["embedding"].reshape(-1), 1,
             item["attributes"])
        for item in raw_words
    ]
    # NOTE(review): this span begins mid-function — the enclosing `def`
    # (presumably remove_duplicates, per the smoke test below) is outside
    # this view; indentation here is reconstructed.
    # Deduplicate by canonical form (text with ignored chars stripped),
    # keeping only the first occurrence.
    for w in words:
        canonical = ignore_char_regex.sub("", w.text)
        if not canonical in seen_words:
            seen_words.add(canonical)
            # Keep the original ordering
            unique_words.append(w)
    return unique_words


def remove_stop_words(words: List[Word]) -> List[Word]:
    """Drop words that are single characters or fail the validity regex."""
    return [w for w in words if (
        len(w.text) > 1 and is_valid_word.match(w.text))]


# Run "smoke tests" on import
assert [w.text for w in remove_stop_words([
    Word('a', [], 1),
    Word('ab', [], 1),
    Word('-ab', [], 1),
    Word('ab_', [], 1),
    Word('a.', [], 1),
    Word('.a', [], 1),
    Word('ab', [], 1),
])] == ['ab', 'ab']
assert [w.text for w in remove_duplicates([
    Word('a.b', [], 1),
    Word('-a-b', [], 1),
    Word('ab_+', [], 1),
    Word('.abc...', [], 1),
])] == ['a.b', '.abc...']