Example #1
def forecast_token(text, masked_index, tokenizer, model):
    tokenized_text = ['[CLS]']
    doc = nlp(text)
    tokenized_text.extend([token.text for token in doc])
    tokenized_text.append('[SEP]')

    synonyms_ = get_candidate_tokens(tokenized_text[masked_index])
    synonyms_ = list(set(synonyms_))

    masked_token = tokenized_text[masked_index]
    token_polarity = int(Word(masked_token, language="en").polarity)

    synonyms = []
    for elem in synonyms_:
        if int(Word(elem, language="en").polarity) == token_polarity:
            synonyms.append(elem)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    tokenized_text[masked_index] = '[MASK]'

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    token_idxs = [
        tokenizer.convert_tokens_to_ids([word])[0] for word in synonyms
    ]
    preds = np.array([predictions[0, masked_index, idx] for idx in token_idxs])
    sort_top = preds.argsort()
    # Keep every candidate whose score is within a small tolerance of the best one.
    candidate_tokens = []
    for nn in np.arange(len(preds)):
        if abs(preds[nn] - preds[sort_top[-1]]) < 0.0001:
            candidate_tokens.append(synonyms[nn])

    # If the masked token is among the top-scoring candidates, consider it correct.
    if masked_token in candidate_tokens:
        predicted_token, softmax_prob = masked_token, 100
    else:
        predicted_token, softmax_prob = synonyms[sort_top[-1]], preds[
            sort_top[-1]]

    # Don't change the token if the predicted token is the same as the original,
    # ignoring upper/lower case.
    if masked_token.lower() == predicted_token.lower():
        predicted_token = masked_token
    return predicted_token, softmax_prob
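The helper get_candidate_tokens used above is not shown in this example. A minimal sketch of what it might look like, assuming it simply collects polyglot embedding neighbours as replacement candidates (hypothetical, not the original implementation):

from polyglot.text import Word

def get_candidate_tokens(token):
    # Hypothetical helper: use polyglot embedding neighbours as candidate
    # replacements, falling back to the token itself when no embedding exists.
    try:
        return list(Word(token, language="en").neighbors) + [token]
    except KeyError:
        return [token]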
Example #2
def polyglot_stem():
    print "\nDerivational Morphemes using polyglot library"
    for w in words_derv:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
    print "\nInflectional Morphemes using polyglot library"
    for w in word_infle:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
    print "\nSome Morphemes examples using polyglot library"
    for w in word_infle:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
Example #3
def convertToPolyglotMorf(sentences, save=False):
    # List of sentence strings to a list of morpheme strings

    total_review_morf_text_list = []
    i = 1
    morfed_sentences = []
    print(len(sentences))
    for sentence in sentences:
        print(i)
        tokenized_sentence = ucto_tokenize(sentence)
        morfed_sentence = []
        for w in tokenized_sentence:
            w = str(w)
            w = Word(w, language="nl")
            #print("{:<20}{}".format(w, w.morphemes))
            morfed_sentence += w.morphemes
        #print(review_morf_list)
        morfed_sentences += morfed_sentence
        i += 1

    morfed_sentences_text = '*%'.join(morfed_sentences)

    if save is True:

        with open("TrainFiles/convertedPolyglotMorfText.txt",
                  "w") as text_file:
            text_file.write(morfed_sentences_text)

    return morfed_sentences
Example #4
def get_sems(word, lang):
    print(word)
    w = Word(word.lower(), language=CODES[lang.lower()])
    try:
        res = w.neighbors
    except Exception as e:
        logger.warning(e)
        return None
    return res
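CODES is not defined in this snippet; it is presumably a mapping from language names to the ISO codes polyglot expects. A hypothetical version, for illustration only:

# Hypothetical mapping from lowercase language names to polyglot language codes.
CODES = {"english": "en", "german": "de", "spanish": "es", "hebrew": "he"}

# Example call: get_sems("obama", "English")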
Example #5
    def test_polyglot1(self):
        import polyglot
        from polyglot.text import Text, Word
     
        text = Text("Bonjour, Mesdames.")
        print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name))

        text = Text("第一条  机动车第三者责任保险合同(以下简称本保险合同)由保险条款、投保单、保险单、批单和特别约定共同组成。 "
                    "本保险合同争议处理适用中华人民共和国法律。")
        #print(text.entities)
        """
        print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
        for word, tag in text.pos_tags:
            print(u"{:<16}{:>2}".format(word, tag))
        """
        word = Word("Obama", language="en")
        word = Word("中华人民共和国", language="zh")
        print("Neighbors (Synonms) of {}".format(word)+"\n"+"-"*30)
        for w in word.neighbors:
            print("{:<16}".format(w))
            print("\n\nThe first 10 dimensions out the {} dimensions\n".format(word.vector.shape[0]))
            print(word.vector[:10])
Example #6
def explore_parse_tree(tokens, tree_nodes):

    # for word token, obtain morphemes
    if len(tokens) == 1:
        w = Word(tokens.text, language='en')
        for morph in w.morphemes:
            if morph == tokens.text or morph == tokens.lemma_:
                continue
            tree_nodes.append((morph, 'morpheme', 1))

        # add word itself
        tree_nodes.append((tokens.text, tokens._.labels, len(tokens)))

    if tokens._.labels == () or len(tokens) == 0:
        return 0

    tree_nodes.append((tokens.text, tokens._.labels, len(tokens)))

    # constituency parsed nodes
    for child in tokens._.children:
        explore_parse_tree(child, tree_nodes)
Example #7
def embeddings():
    from polyglot.text import Word
    data = dict(default_data)
    data[
        'message'] = "Neighbours (Embeddings) - Find neighbors of word API - Parameters: 'word', 'lang' language (default: en)"
    params = {}

    params['word'] = request.args.get('word')
    params['lang'] = request.args.get('lang')

    if not any(params.values()):
        data['error'] = 'Missing parameters'
        return jsonify(data)

    if not params['word']:
        data['error'] = '[word] parameter not found'
        return jsonify(data)

    if not params['lang']:
        # data['error'] = '[lang] parameter not found'
        # return jsonify(data)
        params['lang'] = 'en'

    data['neighbours'] = {}

    try:
        word = Word(params['word'], language=params['lang'])
    except KeyError:
        data['error'] = 'ERROR: word not found'
        return jsonify(data)

    if not word:
        data['error'] = 'word not found'
        return jsonify(data)

    try:
        data['neighbours'] = word.neighbors
    except KeyError:
        data['error'] = 'ERROR: word not found'
        return jsonify(data)

    return jsonify(data)
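The Flask app object, default_data, and the route registration are defined outside this snippet. A hypothetical way to wire the view up, for orientation only:

# Hypothetical wiring; names are assumptions, not the original project's code.
# from flask import Flask, request, jsonify
# app = Flask(__name__)
# default_data = {'message': '', 'error': None}
# app.add_url_rule('/embeddings', 'embeddings', embeddings)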
Example #8
    def _morf(self, originalWord):

        words = []

        w = Word(originalWord, language="he")
        morp = w.morphemes

        if len(morp) == 1:
            words.append(originalWord)
            return words
        else:
            notIn = []
            for w in morp:
                if w not in morp_clean:
                    notIn.append(w)

            if len(notIn) > 1:
                words.append(originalWord)
                return words

            hasInv = False
            for w2 in notIn:
                if len(w2) == 1:
                    hasInv = True

            if hasInv:
                words.append(originalWord)
                return words
            else:
                s1 = set(morp)

                if len(s1) == len(morp):
                    words.extend(morp)
                else:
                    words.append(originalWord)

                return words
Example #9
def _morf(text):

    words = []

    w = Word(text, language="he")
    morp = w.morphemes
    # x is assumed to be a collection of known morphemes defined elsewhere (cf. morp_clean in the previous example)
    if len(morp) == 1:
        words.append(text)
        return words
    else:
        notIn = []
        for w in morp:
            if w not in x:
                notIn.append(w)

        if len(notIn) > 1:
            words.append(text)
            return words

        hasInv = False
        for w2 in notIn:
            if len(w2) == 1:
                hasInv = True

        if hasInv:
            words.append(text)
            return words
        else:
            s1 = set(morp)

            if len(s1) == len(morp):
                words.extend(morp)
            else:
                words.append(text)

            return words
Example #10
from polyglot.text import Word

import sys

LANGUAGE = sys.argv[1]

for line in sys.stdin:
    l = []
    for w in line.split():
        m = Word(w, language=LANGUAGE).morphemes
        if len(m) == 1:
            l.append(w)
        else:
            l.append("@@ ".join(m))
    print(" ".join(l))
Example #11
import pickle

from polyglot.downloader import downloader
from polyglot.text import Word

# full_train_text = ' '.join(FullTrainCorpusList)

with open('PosTestData.data', 'rb') as filehandle:
    # read the data as binary data stream
    PosTestCorpusList = pickle.load(filehandle)

ShorterCorpusList = PosTestCorpusList[:100]
short_text = ' '.join(ShorterCorpusList)

pos_test_text = ' '.join(PosTestCorpusList)

print(downloader.supported_languages_table("morph2"))

words = ["preprocessing", "processor", "invaluable", "thankful", "crossed"]
for w in words:
    w = Word(w, language="en")
    print("{:<20}{}".format(w, w.morphemes))

# train_data = list(pos_test_text)
#
# io = morfessor.MorfessorIO()
#
# #train_data = list(io.read_corpus_file('training_data'))
#
# model = morfessor.BaselineModel()
#
# #model.load_data(train_data, count_modifier=lambda x: 1)
# #def log_func(x):
# #    return int(round(math.log(x + 1, 2)))
# #model_logtokens.load_data(train_data, count_modifier=log_func)
# model.load_data(train_data)
Example #12
def read_extra_features(split_dir,
                        normal_wiki_ngram_2=None,
                        normal_wiki_ngram_3=None,
                        simple_wiki_ngram_2=None,
                        simple_wiki_ngram_3=None,
                        cbt_corpus_ngram_2=None,
                        cbt_corpus_ngram_3=None,
                        normal_wiki=None,
                        simple_wiki=None,
                        lexicon_dir=None,
                        brown_dict=None,
                        lang_8_corpus=None,
                        tatoeba=None,
                        cbt_corpus=None,
                        nlp=None):
    if 'tsv' in split_dir:
        data = pd.read_csv(split_dir, sep='\t', quoting=csv.QUOTE_NONE)
    elif 'xlsx' in split_dir:
        data = pd.read_excel(split_dir)

    data.rename(columns={'subcorpus': 'corpus'}, inplace=True)
    data.token.fillna('null', inplace=True)

    print('Generating dependencies corpus')
    data['pos_label'] = data.apply(
        lambda x: get_meta(x.sentence, x.token, nlp, 'pos'), axis=1)
    data['sentence_pre'] = data.apply(
        lambda x: get_meta(x.sentence, x.token, nlp, 'text'), axis=1)
    #data['entities_sent'] = data.apply(lambda x: get_meta(x.sentence, x.token, nlp, 'ent'), axis=1)
    data['dep_target'] = data.apply(
        lambda x: get_meta(x.sentence, x.token, nlp, 'dep'), axis=1)

    extra_features = []

    len_lang_8_words = len(lang_8_corpus.split(' '))
    len_tatoeba = len(tatoeba.split(' '))
    len_cbt = len(cbt_corpus.split(' '))
    len_normal_wiki = len(normal_wiki.split(' '))
    len_simple_wiki = len(simple_wiki.split(' '))

    print('Generating auxiliary complexity')
    extra_lexicon = pd.read_csv(lexicon_dir,
                                sep='\t',
                                names=['token', 'complex_aux'])
    extra_lexicon['token_l'] = extra_lexicon['token'].str.lower()
    data['token_l'] = data['token'].str.lower()
    data_merged = pd.merge(data,
                           extra_lexicon[['token_l', 'complex_aux']],
                           on='token_l',
                           how='left')

    def find_position(row):
        try:
            if ' ' in row.token:
                ix = row.sentence_pre.index(row.token.split(' ')[0])
                return [ix, ix + 1]
            else:
                return [row.sentence_pre.index(row.token)]
        except:
            if ' ' in row.token:
                token_find = row.token.split(' ')[0]
            else:
                token_find = row.token

            for ix, w in enumerate(row.sentence_pre):
                if token_find in w:
                    if ' ' in row.token:
                        return [ix, ix + 1]
                    else:
                        return [ix]
            if ' ' in row.token:
                token_find = row.token_l.split(' ')[0]
            else:
                token_find = row.token_l

            for ix, w in enumerate(row.sentence_pre):
                if token_find in w:
                    if ' ' in row.token_l:
                        return [ix, ix + 1]
                    else:
                        return [ix]

    print('Counting ...')

    data_merged['position'] = data_merged.parallel_apply(
        lambda x: find_position(x), axis=1)
    data_merged['pos_tag'] = data_merged.apply(
        lambda x: x.pos_label[x.position[0]], axis=1)
    #data_merged['entity'] = data_merged.apply(lambda x: x.entities_sent[x.position], axis=1)
    data_merged['len_sentence'] = data_merged.parallel_apply(
        lambda x: len(x.sentence), axis=1)

    data_merged['len_token'] = data_merged.parallel_apply(
        lambda x: len(x.token), axis=1)

    data_merged['count_senses'] = data_merged.apply(
        lambda x: sum([len(wn.synsets(w)) for w in x.token.split(' ')]),
        axis=1)
    data_merged['count_tags'] = data_merged.parallel_apply(
        lambda x: sum([len(brown_dict[w.lower()])
                       for w in x.token.split(' ')]),
        axis=1)
    data_merged['count_syllables'] = data_merged.parallel_apply(
        lambda x: syllables.estimate(x.token), axis=1)
    data_merged['count_morphemes'] = data_merged.parallel_apply(lambda x: sum(
        [len(Word(w, language='en').morphemes) for w in x.token.split(' ')]),
                                                                axis=1)

    print('Counting ...')
    data_merged['count_after'] = data_merged.parallel_apply(
        lambda x: len(x.sentence.partition(x.token)[2].split(' ')), axis=1)
    data_merged['count_before'] = data_merged.parallel_apply(
        lambda x: len(x.sentence.partition(x.token)[0].split(' ')), axis=1)

    def get_features_from_corpus(row):

        count_lang_8 = lang_8_corpus.count(row.token)
        count_tatoeba = tatoeba.count(row.token)
        count_cbt = cbt_corpus.count(row.token)
        count_normal_wiki = normal_wiki.count(row.token)
        count_simple_wiki = simple_wiki.count(row.token)

        return pd.Series([
            count_lang_8, count_lang_8 / len_lang_8_words, count_tatoeba,
            count_tatoeba / len_tatoeba, count_cbt, count_cbt / len_cbt,
            count_normal_wiki, count_normal_wiki / len_normal_wiki,
            count_simple_wiki, count_simple_wiki / len_simple_wiki
        ])

    print('Generating features from corpus ...')
    data_merged[[
        'count_lang_8', 'freq_lang_8', 'count_tatoeba', 'freq_tatoeba',
        'count_cbt', 'freq_cbt', 'count_normal_wiki', 'freq_normal_wiki',
        'count_single_wiki', 'freq_single_wiki'
    ]] = data_merged.parallel_apply(lambda x: get_features_from_corpus(x),
                                    axis=1)

    data_merged['count_dep'] = data_merged.parallel_apply(
        lambda x: Counter(x.dep_target)[x.token], axis=1)
    data_merged['count_words'] = data_merged.parallel_apply(
        lambda x: x.token.count(' '), axis=1)

    def get_tags_features(row):
        lim_aux = row.position[0] - 8
        if len(row.position) > 1:
            lim_sup = row.position[1]
        else:
            lim_sup = row.position[0]

        sentence_pre = ' '.join(
            row.sentence_pre[(0 if lim_aux < 0 else lim_aux):(lim_sup + 7)])
        tags_cut_c = Counter(
            row.pos_label[(0 if lim_aux < 0 else lim_aux):(lim_sup + 5)])
        count_nouns = tags_cut_c['NOUN'] if 'NOUN' in tags_cut_c else 0
        count_verbs = tags_cut_c['VERB'] if 'VERB' in tags_cut_c else 0
        ratio = (count_nouns /
                 count_verbs) if count_nouns != 0 and count_verbs != 0 else 0

        return pd.Series([
            ratio, tags_cut_c['PROPN'] if 'PROPN' in tags_cut_c else 0,
            count_nouns, tags_cut_c['ADV'] if 'ADV' in tags_cut_c else 0,
            count_verbs, tags_cut_c['PART'] if 'PART' in tags_cut_c else 0
        ])

    print('Generating tags features ...')
    data_merged[[
        'ratio', 'count_propn', 'count_noun', 'count_adv', 'count_verb',
        'count_part'
    ]] = data_merged.parallel_apply(lambda x: get_tags_features(x), axis=1)

    def get_ngram_features(row):
        if len(row.position) > 1:
            pos_after = row.position[1]
        else:
            pos_after = row.position[0]

        pos_before = row.position[0]

        if pos_after + 1 < len(row.sentence_pre):
            tuple_after = (row.sentence_pre[pos_after],
                           row.sentence_pre[pos_after + 1])
        else:
            tuple_after = (row.sentence_pre[pos_after], '.')

        if pos_before - 1 >= 0:
            tuple_before = (row.sentence_pre[pos_before - 1],
                            row.sentence_pre[pos_before])
        else:
            tuple_before = ('.', row.sentence_pre[pos_before])

        aux_features = []
        aux_features.append(normal_wiki_ngram_2[tuple_after])
        aux_features.append(simple_wiki_ngram_2[tuple_after])
        aux_features.append(cbt_corpus_ngram_2[tuple_after])
        aux_features.append(normal_wiki_ngram_2[tuple_before])
        aux_features.append(simple_wiki_ngram_2[tuple_before])
        aux_features.append(cbt_corpus_ngram_2[tuple_before])

        aux_features.append(normal_wiki_ngram_3[tuple_after])
        aux_features.append(simple_wiki_ngram_3[tuple_after])
        aux_features.append(cbt_corpus_ngram_3[tuple_after])
        aux_features.append(normal_wiki_ngram_3[tuple_before])
        aux_features.append(simple_wiki_ngram_3[tuple_before])
        aux_features.append(cbt_corpus_ngram_3[tuple_before])

        return pd.Series(aux_features)

    print('Generating ngram features ...')
    data_merged[[
        'count_ngram_2_simple_wiki_after', 'count_ngram_2_normal_wiki_after',
        'count_ngram_2_cbt_corpus_after', 'count_ngram_2_simple_wiki_before',
        'count_ngram_2_normal_wiki_before', 'count_ngram_2_cbt_corpus_before',
        'count_ngram_3_simple_wiki_after', 'count_ngram_3_normal_wiki_after',
        'count_ngram_3_cbt_corpus_after', 'count_ngram_3_simple_wiki_before',
        'count_ngram_3_normal_wiki_before', 'count_ngram_3_cbt_corpus_before'
    ]] = data_merged.apply(lambda x: get_ngram_features(x), axis=1)

    return data_merged.drop(['sentence', 'token', 'token_l'], axis=1)
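The helper get_meta is not included in this example. A rough sketch of its assumed behaviour, returning one value per token from a spaCy parse of the sentence (hypothetical; field names are guessed from the calls above):

def get_meta(sentence, token, nlp, field):
    # Hypothetical helper: parse the sentence with spaCy and return a
    # per-token list of POS tags, token texts, or dependency labels.
    # 'token' is unused here but kept to mirror the call sites above.
    doc = nlp(sentence)
    if field == 'pos':
        return [t.pos_ for t in doc]
    if field == 'text':
        return [t.text for t in doc]
    if field == 'dep':
        return [t.dep_ for t in doc]
    return None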
Example #13
print("amount of adjectives")
print(len(a))
print("most common verbs")
b = Counter(verbs)
print(b.most_common(20))
print("amount of verbs")
print(len(b))

#init lists for adjective types:
positive_adjectives = []
neutral_adjectives = []
negative_adjectives = []

#find out adjective sentiment and append to lists accordingly
for adjective in adjectives:
    w = Word(adjective, language="fi")
    if (w.polarity == -1):
        negative_adjectives.append(w)
    elif (w.polarity == 1):
        positive_adjectives.append(w)
    else:
        neutral_adjectives.append(w)
print("amount of positive adjectives")
print(len(positive_adjectives))
print("amount of negative adjectives")
print(len(negative_adjectives))
print("amount of neutral adjectives")
print(len(neutral_adjectives))

pos_a = Counter(positive_adjectives)
neg_a = Counter(negative_adjectives)
Example #14
def getIndonesianMorphs(word):
    w = Word(word, language="id")
    return w.morphemes
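A quick hypothetical call, assuming polyglot's Indonesian morphology model is installed:

# The example word and its segmentation depend on the installed model.
print(getIndonesianMorphs("mempersiapkan"))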
Example #15
# ## Named Entity Recognition

text = Text(u"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden")
print(text.entities)


# ## Polarity

print("{:<16}{}".format("Word", "Polarity")+"\n"+"-"*30)
for w in zen.words[:6]:
    print("{:<16}{:>2}".format(w, w.polarity))


# ## Embeddings

word = Word("Obama", language="en")
print("Neighbors (Synonms) of {}".format(word)+"\n"+"-"*30)
for w in word.neighbors:
    print("{:<16}".format(w))
print("\n\nThe first 10 dimensions out the {} dimensions\n".format(word.vector.shape[0]))
print(word.vector[:10])


# ## Morphology

word = Text("Preprocessing is an essential step.").words[0]
print(word.morphemes)


# ## Transliteration
Example #16
from polyglot.text import Word, Text

words = "həmişə bütün hüquq normalarda hər üç element olmur".split(" ")

for w in words:
    w = Word(w, language="az")
    print("{:<20}{}".format(w, w.morphemes))
"""

həmişə              ['həmişə']
bütün               ['bütün']
hüquq               ['hüquq']
normalarda          ['norma', 'larda']
hər                 ['hər']
üç                  ['üç']
element             ['element']
olmur               ['olmur']

"""

text = "həmişəbütünhüquqnormalardahərüçelementolmur"

splitted_text = Text(text)
splitted_text.language = "az"
print(splitted_text.morphemes)
"""

['həmişə', 'bütün', 'hüquq', 'norma', 'larda', 'hər', 'üç', 'element', 'olmur']

"""
Example #17
def morf(w):
    return Word(w, language=LANGUAGE).morphemes
Example #18
def preprocess_data(sentence_lists, pre_defined, meta, lang, dataset_type,
                    missing_embed_to_zeros):
    # second pass:
    embeds = []
    max_len = meta['maxlen']
    count = 0
    too_long = 0
    errors = 0
    no_embedding = 0
    strange_id = 0
    for sentences in sentence_lists:
        for sentence in sentences:
            count += 1
            if len(sentence) > max_len - 1:  # first token will denote language
                # print(sentence)
                too_long += 1
                continue
            try:
                # first token is the root token for the dependency tree, also encodes the language
                this_sentence = {
                    'token': [pre_defined[lang]],
                    'head': [pre_defined[lang]],
                    'upos': [pre_defined[lang]],
                    'deprel': [pre_defined[lang]]
                }

                #go through the sentence, discarding all the tokens with composite id
                tokens = []
                for tok in sentence:
                    if tok.head is None:
                        continue
                    try:
                        int(tok.id)  # check if that fails
                        tokens.append(tok)
                    except:
                        continue

                for t, token in enumerate(tokens):
                    try:
                        assert int(token.id) == t + 1, \
                            "token.id must equal t+1, instead got {}, t={}".format(token.id, t)
                        assert int(token.head) <= len(sentence)
                    except:
                        strange_id += 1
                        raise ValueError("strange id")

                    word = Word(token.form, language=lang)
                    try:
                        word_vector = word.vector
                    except:
                        no_embedding += 1
                        if missing_embed_to_zeros:
                            word_vector = np.zeros(256, dtype=np.float32)
                        else:
                            raise ValueError("no embedding")

                    if 'embed' not in this_sentence:
                        this_sentence['embed'] = [np.zeros_like(word_vector)]
                    this_sentence['embed'].append(word_vector)
                    this_sentence['token'].append(
                        meta['emb_index'][lang][token.lemma])
                    this_sentence['head'].append(int(token.head))
                    this_sentence['upos'].append(meta['upos'][token.upos])
                    this_sentence['deprel'].append(
                        meta['deprel'][token.deprel])

                this_sentence_nice = {
                    key: torch.tensor(pad(val, max_len))
                    for key, val in this_sentence.items() if key != 'embed'
                }
                pad_embed = pad(this_sentence['embed'], max_len,
                                np.zeros_like(this_sentence['embed'][0]))
                pad_embed_nice = torch.from_numpy(np.array(pad_embed))
                this_sentence_nice['embed'] = pad_embed_nice
                embeds.append(this_sentence_nice)

            except ValueError as e:
                errors += 1
                continue
    if count > 0:
        print('kept ', len(embeds) / count, ' of all sentences')
    else:
        print("no valid sentences at all - what's going on here?")

    print('total', count, ', too long', too_long, ', no_embedding',
          no_embedding, ', strange ids', strange_id, ', total errors', errors)
    meta['stats'][lang][dataset_type] = OrderedDict()
    meta['stats'][lang][dataset_type]['orig_size'] = count
    meta['stats'][lang][dataset_type]['size'] = len(embeds)
    return embeds
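The pad helper is assumed but not defined above. A minimal sketch consistent with how it is called, pad(val, max_len) and pad(embeds, max_len, filler); this is a hypothetical implementation:

def pad(seq, length, filler=0):
    # Right-pad the sequence with `filler` until it reaches `length`.
    return list(seq) + [filler] * (length - len(seq))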
Example #19
def get_morphemes(word):
    """
    Returns a list of morphemes of the given word.
    """
    return ' '.join([str(m) for m in Word(word, language='en').morphemes])
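Hypothetical usage, assuming the English morphology model is available:

# The exact segmentation depends on the installed polyglot model.
print(get_morphemes("preprocessing"))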
Example #20
def stem4(word):
    w = Word(word, language="tr")
    return w.morphemes[0]
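The design choice here is to treat the first morpheme of polyglot's segmentation as a crude stem. A hypothetical call, assuming the Turkish morphology model is installed:

# Returns the first morpheme of the segmentation as a rough stem.
print(stem4("kitaplardan"))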