Example #1
def prepareSentence2(sentence):
    # Parse the sentence once, then derive both the lemmas and the POS tags
    # from that parse result.
    sentenceParseResult = parse_text(sentence)

    sentenceLemmatized = lemmatize(sentenceParseResult)

    sentencePosTagged = posTag(sentenceParseResult)

    # Merge each token's lemmatizer fields with its POS tag.
    sentenceLemmasAndPosTags = []

    for i in range(len(sentenceLemmatized)):
        sentenceLemmasAndPosTags.append([])

    for i in range(len(sentenceLemmatized)):
        for item in sentenceLemmatized[i]:
            sentenceLemmasAndPosTags[i].append(item)
        sentenceLemmasAndPosTags[i].append(sentencePosTagged[i][3])

    # Build Word objects from the merged fields (position, token, lemma, POS).
    words = []

    for rawWord in sentenceLemmasAndPosTags:
        word = Word(rawWord[1] - 1, rawWord[2])
        word.lemma = rawWord[3]
        word.pos = rawWord[4]
        words.append(word)

    return words
Example #2
def _get_words(raw_sentence):
    words = []
    for i, item in enumerate(raw_sentence['words']):
        word = Word(i + 1, item[0])
        word.lemma = item[1]['Lemma']
        word.pos = item[1]['PartOfSpeech'].lower()
        word.ner = item[1]['NamedEntityTag']
        words.append(word)
    return words
Example #3
def pad_sentences(wordlist, padding=20):
    # wordlist: a list of sentences, each a list of Word objects.
    padded_wordlist = []
    padding_token = Word()
    padding_token.set_as_pad_token()
    for s in wordlist:
        padded = [padding_token] * padding
        if len(s) > padding:
            padded = s[:padding]  # truncate long sentences
        else:
            padded[padding-len(s):] = s  # left-pad short sentences
        padded_wordlist.append(padded)
    return padded_wordlist
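
The loop above left-pads short sentences and truncates long ones to a fixed length. A minimal self-contained sketch of the same logic, using a plain "<PAD>" string in place of the project's Word pad token (an assumption for illustration):

def pad_sentences_demo(sentences, padding=5):
    # Same left-pad/truncate logic with a string stand-in for the pad token.
    padded_list = []
    for s in sentences:
        padded = ["<PAD>"] * padding
        if len(s) > padding:
            padded = s[:padding]           # truncate long sentences
        else:
            padded[padding - len(s):] = s  # left-pad short sentences
        padded_list.append(padded)
    return padded_list

print(pad_sentences_demo([["a", "b"], ["a", "b", "c", "d", "e", "f"]]))
# [['<PAD>', '<PAD>', '<PAD>', 'a', 'b'], ['a', 'b', 'c', 'd', 'e']]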
Example #4
def get_avg_vector(word, embedding_words):

    if " " in word:
        single_words = word.split(" ")
        list_vector = []

        for single_word in single_words:
            word_vec = find_word(single_word, embedding_words)
            if word_vec:
                list_vector.append(word_vec.vector)
            else:
                # Try again with lowercase
                single_word = single_word.lower()
                word_vec = find_word(single_word, embedding_words)
                if word_vec:
                    list_vector.append(word_vec.vector)

        # print("list_vector: ", list_vector)
        # input(">>>>>>>>")

        returned_Word = Word(word, vectors.mean_list(list_vector), 1)
    else:
        returned_Word = find_word(word, embedding_words)

    # print("Avg returned vector = ", returned_vector)
    # input(">>>>")

    return returned_Word
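
For multi-word inputs the function averages the vectors of the individual tokens. A stand-alone sketch of that averaging step; the toy table, avg_phrase_vector, and the use of NumPy's mean are illustrative stand-ins for the project's find_word and vectors.mean_list helpers:

import numpy as np

# Hypothetical toy embedding table standing in for embedding_words.
toy_vectors = {
    "new":  np.array([1.0, 0.0]),
    "york": np.array([0.0, 1.0]),
}

def avg_phrase_vector(phrase, table):
    # Average the vectors of the tokens that are present in the table.
    found = [table[t] for t in phrase.split(" ") if t in table]
    return np.mean(found, axis=0) if found else None

print(avg_phrase_vector("new york", toy_vectors))  # [0.5 0.5]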
Example #5
    def __init__(self, words: List[Word],
                 attribute_to_values_dict: AttributeToValuesDict):
        """
        A standard reader. This class should not be instantiated directly. Instead, use UDTreebankReader.

        The reader constructor takes a word list and a dict of attributes and all the values they can take.
        It will ensure that examples for attribute-values that aren't in the dict are discarded.
        """
        self._unimorph_attributes_to_values_dict: AttributeToValuesDict = attribute_to_values_dict

        # Discard invalid attribute-values
        modified_words: List[Word] = []
        for w in words:
            modified_attr_vals: Dict[str, str] = {}
            for attr in w.get_attributes():
                if attr not in self._unimorph_attributes_to_values_dict:
                    # Untracked attribute, so skip it
                    continue

                val = w.get_attribute(attr)
                if val not in self._unimorph_attributes_to_values_dict[attr]:
                    # Untracked value, so skip it
                    continue

                modified_attr_vals[attr] = val

            # Create modified word
            modified_words.append(
                Word(w.get_word(), w.get_embedding(), w.get_count(),
                     modified_attr_vals))

        self._words: List[Word] = modified_words

        self._cache: Dict[str, Dict[str, Any]] = {}
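
The constructor keeps only attribute/value pairs listed in the attribute-to-values dict. The same filtering idea on plain dicts, with illustrative names rather than the reader's actual API:

allowed = {"Case": {"NOM", "ACC"}, "Number": {"SG", "PL"}}  # illustrative

def filter_attribute_values(attr_vals, allowed):
    # Keep only attribute/value pairs that appear in the allowed dict.
    return {attr: val for attr, val in attr_vals.items()
            if attr in allowed and val in allowed[attr]}

print(filter_attribute_values({"Case": "NOM", "Number": "DU", "Tense": "PST"},
                              allowed))
# {'Case': 'NOM'}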
Example #6
def get_random_wordpairs(question_count=10):
    ''' returns a list of word objects read from the database '''

    # create a session with defaults
    session = get_session_with_defaults()

    wordpair_list = []
    # Note: adding a filter such as where(gte('frequency', 1)) fails with
    # "Cannot execute this query as it might involve data filtering and thus
    # may have unpredictable performance. If you want to execute this query
    # despite the performance unpredictability, use ALLOW FILTERING".
    # statement_select_words = (QueryBuilder.select_from("tbl_deutsch").columns('german_word', 'english_word').limit(question_count))

    # random_uniqueId = uuid.uuid4() # Random
    # To get a random set of words, derive a UUID from the current time
    # plus a random offset.
    ticks = time.time()
    rand_time = lambda: float(random.randrange(0, 10000) + ticks)
    random_uniqueId = uuid.uuid5(uuid.NAMESPACE_DNS, str(rand_time()))

    statement_select_words = "SELECT german_word,english_word FROM tbl_deutsch WHERE id>%s LIMIT %s ALLOW FILTERING"
    future = session.execute_async(statement_select_words,
                                   [random_uniqueId, question_count])

    try:
        rows = future.result()
        for row in rows:
            current_word = Word(row.german_word, row.english_word)
            wordpair_list.append(current_word)
    except ReadTimeout:
        log.exception("Query timed out:")

    return wordpair_list
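
The random id is built by hashing the current time plus a random offset into a UUID. The standard-library calls involved can be tried on their own:

import random
import time
import uuid

# Build a pseudo-random UUID from the current time plus a random offset,
# as in the snippet above.
ticks = time.time()
seed = str(float(random.randrange(0, 10000) + ticks))
random_unique_id = uuid.uuid5(uuid.NAMESPACE_DNS, seed)
print(random_unique_id)  # varies from run to run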
Example #7
def get_random_wordpairs(database, table, question_count):
    ''' returns a list of random word objects'''
    wordpair_list = []
    rows_tuple = select_random_questions(database, table, question_count)
    for row in rows_tuple:
        current_word = Word(row[0], row[1])
        wordpair_list.append(current_word)
    return wordpair_list
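
select_random_questions is a project-specific helper; if the backing store were SQLite, a minimal equivalent (a hypothetical sketch, not the project's implementation) could rely on ORDER BY RANDOM():

import sqlite3

def select_random_questions_sqlite(db_path, table, question_count):
    # Hypothetical stand-in: fetch N random rows from an SQLite table.
    # The table and column names are assumptions for illustration only.
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            f"SELECT german_word, english_word FROM {table} "
            "ORDER BY RANDOM() LIMIT ?", (question_count,))
        return cur.fetchall()
    finally:
        conn.close()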
Example #8
def parse_line(line: str, frequency: int) -> Word:
    # print("Line=", line)
    tokens = line.split(" ")
    word = tokens[0]
    if emb_config.do_normalize_emb:
        vector = v.normalize(np.array([float(x) for x in tokens[1:]]))
    else:
        vector = np.array([float(x) for x in tokens[1:]])
    return Word(word, vector, frequency)
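
The line format here is the usual text format for word embeddings: a token followed by its vector components. A self-contained version of the parsing, with an explicit L2 normalization standing in for the project's v.normalize helper (an assumption about what it does):

import numpy as np

def parse_embedding_line(line, do_normalize=False):
    # "king 0.12 -0.3 0.5 ..." -> ("king", np.ndarray)
    tokens = line.rstrip().split(" ")
    vector = np.array([float(x) for x in tokens[1:]])
    if do_normalize:
        vector = vector / np.linalg.norm(vector)  # assumed L2 normalization
    return tokens[0], vector

print(parse_embedding_line("king 3.0 4.0", do_normalize=True))
# ('king', array([0.6, 0.8]))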
Example #9
def get_selected_wordpairs(database, table, question_count,
                           sql_select_statement):
    ''' returns a list of word objects based on a SELECT ... WHERE statement '''
    wordpair_list = []
    # sql_select_query = "SELECT german_word,english_word FROM verb where german_word LIKE 'ver%'"
    rows_tuple = execute_generic_query(database, table, sql_select_statement)
    for row in rows_tuple:
        current_word = Word(row[0], row[1])
        wordpair_list.append(current_word)
    return wordpair_list
Example #10
    def load(path_input):
        sentences = []
        sentence = []
        with codecs.open(path_input, 'r', 'utf8') as f:
            lines = f.readlines()

        for line in lines:
            if line == '\n':
                sentences.append(sentence)
                sentence = []
                continue

            parts = line.strip().split('\t')
            word = Word(parts[0], parts[1]) # punctuation head is root
            word.lemma = parts[2]
            word.pos = parts[4]
            word.dep = parts[7]
            word.head = parts[6]
            sentence.append(word)

        if sentence:  # skip an empty trailing sentence
            sentences.append(sentence)
        return sentences
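
The column indices used above (1 = form, 2 = lemma, 4 = POS, 6 = head, 7 = dependency relation) match the tab-separated CoNLL-style layout. A tiny illustration of splitting one such line:

# One CoNLL-style line: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL ...
line = "1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj"
parts = line.strip().split('\t')
print(parts[1], parts[2], parts[4], parts[6], parts[7])
# Dogs dog NNS 2 nsubj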
Example #11
    def __init__(self, verse, num_verse):
        """Create an object of type Verse."""
        super().__init__(verse)
        self.num_verse = num_verse
        self.words = self.splitwords()
        self.__e_posttonique()
        # self.__ents_posttonique('es', 'ës')
        # self.__ents_posttonique('ent', 'ënt')
        self.words = [Word(w) for w in self.words]
        self.__e_last()
        self.update()
        self.cesure = set()
        if gv.METRICS > 1:
            self.__set_dict_syll()
            self.__find_cesure()
        self.str_cesure = '/'.join(sorted(self.cesure))
Example #12
    def read(cls, paths: List[str]) -> List[Word]:
        """
        Should be overridden with the logic to (i) read all words in the dataset and (ii) discover
        the values each unimorph attribute can take and place them in
        self._unimorph_attributes_to_values_dict.
        """
        raw_words: List[Dict[str, Any]] = []
        for path in paths:
            with open(path, "rb") as h:
                raw_words.extend(pickle.load(h))

        # Read all words and store them in self._words
        words = []
        for item in raw_words:
            words.append(
                Word(item["word"], item["embedding"].reshape(-1), 1,
                     item["attributes"]))

        return words
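
Each pickled item is expected to be a dict with "word", "embedding", and "attributes" keys. A minimal round-trip under that assumption (file name and values are illustrative):

import pickle
import numpy as np

items = [{"word": "dogs",
          "embedding": np.zeros((1, 4)),
          "attributes": {"Number": "PL"}}]

with open("words_demo.pkl", "wb") as h:   # hypothetical file name
    pickle.dump(items, h)

with open("words_demo.pkl", "rb") as h:
    loaded = pickle.load(h)

print(loaded[0]["word"], loaded[0]["embedding"].reshape(-1).shape)
# dogs (4,)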
Example #13
def remove_duplicates(words: List[Word]) -> List[Word]:
    # Header and accumulators restored here so the snippet runs on its own;
    # ignore_char_regex is a compiled regex defined elsewhere in the module.
    seen_words = set()
    unique_words = []
    for w in words:
        canonical = ignore_char_regex.sub("", w.text)
        if canonical not in seen_words:
            seen_words.add(canonical)
            # Keep the original ordering
            unique_words.append(w)
    return unique_words


def remove_stop_words(words: List[Word]) -> List[Word]:
    # is_valid_word is a compiled regex defined elsewhere in the module.
    return [w for w in words if (
            len(w.text) > 1 and is_valid_word.match(w.text))]


# Run "smoke tests" on import
assert [w.text for w in remove_stop_words([
    Word('a', [], 1),
    Word('ab', [], 1),
    Word('-ab', [], 1),
    Word('ab_', [], 1),
    Word('a.', [], 1),
    Word('.a', [], 1),
    Word('ab', [], 1),
])] == ['ab', 'ab']
assert [w.text for w in remove_duplicates([
    Word('a.b', [], 1),
    Word('-a-b', [], 1),
    Word('ab_+', [], 1),
    Word('.abc...', [], 1),
])] == ['a.b', '.abc...']