Example #1
def delete_adj_adv(text):
    tokenized = sent_tokenize(text)
    good_words = []
    old = len(text.split(' '))

    # Part-of-speech (POS) tagger for Romanian.
    tagger = Tagger(language='ro')

    for sentence in tokenized:
        words_list = nltk.word_tokenize(sentence)

        for word in words_list:
            typed = tagger.tag(word)
            if typed[0][1] != 'ADJ' and typed[0][1] != 'ADV':
                good_words.append(typed[0][0])
            elif good_words:
                # keep an adjective/adverb only when it follows an adposition or a determiner
                last_word = good_words[-1]
                if get_type(last_word)[0][1] in ("ADP", "DET"):
                    good_words.append(typed[0][0])

    new_text = ""
    for words in good_words:
        if words in ".,?!":
            new_text = new_text + words
        else:
            new_text = new_text + " " + words

    return new_text, old - len(new_text.split(' '))
Example #2
def get_stem(tokens):
    """
    Gets the root of words by tagging them syntactically
    and then using WordNet lemmatizer.

    @param tokens: list containing morphologically rich words
    @return: A list of words stripped to their root 
    """

    tagger = Tagger(language="en")  #creates Ripple Tagger
    tagged = tagger.tag(tokens)  #creates list of tokens with tags
    lemmat = WordNetLemmatizer()  #creates instance of lemmatizer
    stemmed = []
    for token, tag in tagged:

        wordnettag = get_tag(tag)
        if wordnettag == 'OTHER':  # words with unknown tags are added as they are
            stemmed.append([token, tag])
        else:
            stem = lemmat.lemmatize(token, wordnettag)
            stemmed.append([stem, wordnettag])

    return stemmed


#if __name__ == "__main__":
#getStem()
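
The get_tag helper used above is not part of this snippet. A minimal sketch of such a helper, assuming the tagger emits Universal POS tags and mapping them to the codes accepted by WordNetLemmatizer.lemmatize():

def get_tag(tag):
    # Hypothetical helper: map a Universal POS tag to the WordNet POS code
    # expected by WordNetLemmatizer.lemmatize(); anything else becomes 'OTHER'
    # so the caller keeps the original token and tag unchanged.
    mapping = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}
    return mapping.get(tag, 'OTHER')
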
Example #3
def text_separator():

    text_dict = {"paragraph": [], "sentences": [], "analyses": []}

    tagger = Tagger(language="ro")
    text_support = "Karl Benz dar nici așa Cristi , Marea Neagra nu fu mulțumit Motorwagen. Se duse în curte și aduse un băț cu care o bătu zdravăn, iar biata " \
                   "fată plângea de se scutura cămașa pe ea. - Unchiule, unchiule, țipa ea, cu ce sunt eu de vină că " \
                   "lupul a dat iama în oi? Dar bărbatul cel crud nu cunoștea mila. - Să pleci din casa mea, strigă " \
                   "el în cele din urmă, ostenit de atâta bătaie, că nici nu putea să mai sufle. Să te duci de aici " \
                   "și să nu te mai întorci până ce nu-mi aduci oile înapoi!"

    mrep = lambda s, d: s if not d else mrep(s.replace(*d.popitem()), d)

    text_support = re.sub(r'\s+', ' ', text_support)
    # pad punctuation with a leading space (longer signs first, so '.' does not split them)
    for sign in ['[...]', '...', '.', '!', '?', ':', "'", ',', '"']:
        text_support = text_support.replace(sign, " " + sign)

    # split the text into sentences based on the punctuation marks
    text_support = mrep(text_support, dict_separator).split("###")[:-1]

    text_support = [
        sentence[1:] if sentence.startswith(" ") else sentence
        for sentence in text_support
    ]

    for sentence in text_support:
        text_dict["sentences"].append(sentence)
        text_dict["analyses"].append(tagger.tag(sentence))

    print(text_dict)
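
dict_separator is defined elsewhere in the project. Judging from the '###' split above, it presumably appends a marker after sentence-ending punctuation (which has already been padded with a leading space at this point); a hypothetical reconstruction:

# Hypothetical contents of dict_separator: append a '###' marker after
# sentence-ending punctuation so the text can be split into sentences.
dict_separator = {
    " .": " . ###",
    " !": " ! ###",
    " ?": " ? ###",
}

Note that mrep pops entries from whatever dictionary it receives, so a copy such as dict(dict_separator) would have to be passed if text_separator were called more than once.
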
Example #4
 def test_swedish(self):
     tagger = Tagger(language="swedish")
     self.assertEqual(
         tagger.tag(u"Fördomen har alltid sin rot i vardagslivet"), [
             (u'Fördomen', 'NOUN'),
             (u'har', 'VERB'),
             (u'alltid', 'ADV'),
             (u'sin', 'DET'),
             (u'rot', 'NOUN'),
             (u'i', 'ADP'),
             (u'vardagslivet', 'NOUN'),
         ])
Example #5
 def test_swedish_alternative(self):
     tagger = Tagger(language="swedish-2")
     self.assertEqual(
         tagger.tag(u"Fördomen har alltid sin rot i vardagslivet"),
         [
             (u'Fördomen', 'NOUN'),
             (u'har', 'AUX'),  # Wrong, but predicted using swedish-2
             (u'alltid', 'ADV'),
             (u'sin', 'PRON'),  # Wrong, but predicted using swedish-2
             (u'rot', 'NOUN'),
             (u'i', 'ADP'),
             (u'vardagslivet', 'NOUN'),
         ])
Example #6
def words_score(text):
    original = text
    separatori = [",", '.', '!', '?']
    tokenized = sent_tokenize(text)
    top_sentences = {}
    dictionary = {}

    # Score each word by its part of speech: proper nouns count the most,
    # then verbs, then nouns; anything else gets a small default weight.
    tagger = Tagger(language='ro')
    pos_scores = {'PROPN': 2, 'VERB': 1, 'NOUN': 0.5}

    for i in tokenized:

        for index in separatori:
            i = i.replace(index, '')

        words_list = nltk.word_tokenize(i)

        for words in words_list:
            tagged = tagger.tag(words)
            word, pos = tagged[0][0], tagged[0][1]
            dictionary[word] = dictionary.get(word, 0) + pos_scores.get(pos, 0.1)

    for i in tokenized:
        copy = i
        for index in separatori:
            i = i.replace(index, '')

        words_list = nltk.word_tokenize(i)
        suma = 0
        for words in words_list:
            if words in dictionary.keys():
                suma = suma + dictionary[words]
        top_sentences[copy] = suma

    return top_sentences
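
A usage sketch, not part of the original code: the returned dictionary maps each sentence to its accumulated word score, so a short summary can be built by keeping the highest-scoring sentences (text is assumed to hold the input document):

scores = words_score(text)
top_three = sorted(scores, key=scores.get, reverse=True)[:3]
summary = ' '.join(top_three)
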
Example #7
 def test_english(self):
     tagger = Tagger(language="en")
     self.assertEqual(
         tagger.tag(u"The quick brown fox jumps over the lazy dog ."), [
             (u'The', u'DET'),
             (u'quick', u'ADJ'),
             (u'brown', u'ADJ'),
             (u'fox', u'NOUN'),
             (u'jumps', u'VERB'),
             (u'over', u'ADP'),
             (u'the', u'DET'),
             (u'lazy', u'ADJ'),
             (u'dog', u'NOUN'),
             (u'.', u'PUNCT'),
         ])
Example #8
 def test_french(self):
     tagger = Tagger(language="fra-1")
     self.assertEqual(
         tagger.tag(
             u"Cette annonce a fait l' effet d' une véritable bombe ."), [
                 (u'Cette', 'DET'),
                 (u'annonce', 'NOUN'),
                 (u'a', 'AUX'),
                 (u'fait', 'VERB'),
                 (u"l'", 'DET'),
                 (u'effet', 'NOUN'),
                 (u"d'", 'ADP'),
                 (u'une', 'DET'),
                 (u'véritable', 'ADJ'),
                 (u'bombe', 'NOUN'),
                 (u'.', 'PUNCT'),
             ])
Example #9
def posTagging():
    extractAbout()
    client = MongoClient()
    db = client.usersbot.testRecommendation
    tagger = Tagger(language="it")
    count = 0
    for r in db.find():
        count += 1
        print("restaurant " + str(count))
        id = r['_id']
        tagADJ = []
        tagNOUN = []
        for el in tagger.tag(r['about']):
            if el[1] == 'ADJ':
                tagADJ.append(el[0])
            if el[1] == 'NOUN':
                tagNOUN.append(el[0])
        if len(tagADJ) > 0 or len(tagNOUN) > 0:
            db.update_one({"_id": id}, {"$set": {"tagADJ": tagADJ, "tagNOUN": tagNOUN}})
        else:
            db.delete_one({'_id': id})
    client.close()
Example #10
 def __init__(self, doc_id, language, paragraph_list, doc_text):
     self.doc_id = doc_id
     self.language = language
     self.para_ids = paragraph_list
     self.doc_text = str(doc_text)
     self.word_tokens = re.findall(r'[\w\']+|[.,!?;]', self.doc_text)
     print('Initializing the class and tagging the content, language: {}'.format(
         self.language))
     # split the word tokens into chunks of 500 words each
     self.paragraphs = [
         self.word_tokens[x:x + 500]
         for x in range(0, len(self.word_tokens), 500)
     ]
     self.tagged_content = []
     # tag each paragraph and add it to the tagged content
     for paragraph in self.paragraphs:
         self.tagged_content.append(
             Tagger(language=language).tag(' '.join(paragraph)))
Example #11
def getTagWords():
    client = MongoClient()
    db1 = client.usersbot.testRecommendation
    db2 = client.usersbot.tagWords
    db2.drop()
    db2.create_index([('word', TEXT)],unique=True)
    tagger = Tagger(language="it")
    count = 1
    for r in db1.find():
        #print(r)
        for adj in r['tagADJ']:
            try:
                db2.insert_one({'_id': count, 'word': adj, 'type': 'ADJ', 'count': 1})
                count += 1
            except errors.DuplicateKeyError as e:
                word = str(e).split("{ : ")[1].split(",")[0].replace("\"", "")
                try:
                    id = db2.find({'$text': {'$search':word, '$diacriticSensitive': False}})[0]['_id']
                except IndexError:
                    id = db2.find({'$text': {'$search': adj, '$diacriticSensitive': False}})[0]['_id']
                db2.update_one({"_id": id}, {"$inc": {"count":+1}})
        for noun in r['tagNOUN']:
            try:
                db2.insert_one({'_id': count, 'word': noun, 'type': 'NOUN', 'count': 1})
                count += 1
            except errors.DuplicateKeyError as e:
                # coding=utf-8
                #noun = noun.replace("å","à").encode().replace("xa1","xa0").decode()
                word = str(e).split("{ : ")[1].split(",")[0].replace("\"", "")
                try:
                    id = db2.find({'$text': {'$search':word, '$diacriticSensitive': False}})[0]['_id']
                except IndexError:
                    id = db2.find({'$text': {'$search': noun, '$diacriticSensitive': False}})[0]['_id']
                db2.update_one({"_id": id}, {"$inc": {"count": +1}})


    client.close()
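
Parsing the DuplicateKeyError message to recover the clashing word is fragile across MongoDB and PyMongo versions. A sketch of an alternative that leans on an upsert instead; count_tag_word is a hypothetical helper and assumes the same tagWords collection with a unique index on 'word':

def count_tag_word(collection, word, pos_type):
    # Insert the word with count 1 on first sight, otherwise just bump its counter.
    collection.update_one(
        {'word': word},
        {'$setOnInsert': {'type': pos_type}, '$inc': {'count': 1}},
        upsert=True,
    )

With this approach MongoDB assigns the _id itself, so the manual count-based _id bookkeeping above is no longer needed.
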
Example #12
def eliminate_enumerations(sentences):
    """
    This function eliminates enumerations from sentences.
    :param sentences: the output from @process_text
    :return: the text rebuilt sentence by sentence, with each enumeration replaced
             by its highest-scoring word; sentences without enumerations are left unchanged
    """
    enum_regexp = re.compile(
        r'((\w+\-?\w+\s*\,\s*){2,100}\w+\-?\w+)|((\w+\-?\w+\s*\,\s*){1,100}\s*\w+\s+(si)\s+\w+)'
    )
    enum_regexp_special_case = re.compile(r'((\w+\-?\w+\s*\,\s*){2,100})')
    tagger = Tagger(language="ro")
    tagged_sentences = tagger.tag(sentences)
    sentences = nltk.sent_tokenize(sentences)

    # finding the enumerations
    enumerations = list()
    for sentence in sentences:
        sent_enums = [
            enum_regexp.findall(sentence),
            enum_regexp_special_case.findall(sentence)
        ]
        enumerations.append(sent_enums)

    # process the findall output and take only the full_match enum
    for i in range(0, len(enumerations)):
        if enumerations[i][0]:
            max_len = max([len(j) for j in enumerations[i][0][0]])
            max_len_index = [
                j for j in range(0, len(enumerations[i][0][0]))
                if len(enumerations[i][0][0][j]) == max_len
            ][0]
            enumerations[i][0] = enumerations[i][0][0][max_len_index]

        if enumerations[i][1]:
            max_len = max([len(j) for j in enumerations[i][1][0]])
            max_len_index = [
                j for j in range(0, len(enumerations[i][1][0]))
                if len(enumerations[i][1][0][j]) == max_len
            ][0]
            enumerations[i][1] = enumerations[i][1][0][max_len_index]

    # split the enumerations into tokens of words in tokenized_enums
    tokenized_enums = list()
    token_regex = re.compile(r"\w+-?\w*")
    for it in enumerations:
        if it != [[], []]:
            tokenized_enum = [
                token_regex.findall(str(it[0])),
                token_regex.findall(str(it[1]))
            ]
            tokenized_enums.append(tokenized_enum)
        else:
            tokenized_enums.append([[], []])

    # the output text
    new_text = ''

    # for each enumeration
    for enumeration in range(0, len(enumerations)):

        # if they are not null
        if enumerations[enumeration] != [[], []]:

            # call the function that outputs the part of speech
            p_o_speech = get_part_of_speech_enum(
                tagged_sentences, tokenized_enums[enumeration][0])

            # check if the words from each enumeration are NOUN, ADJ or ADV
            count = 0
            for enum_word in p_o_speech:
                if (enum_word[1] in ('NOUN', 'ADJ', 'ADV')
                        or enum_word[0].lower() in ('și', 'si')):
                    count += 1
            # if they are then eliminate the enum from the sentence and put it in output text
            if count > 0 and count == len(p_o_speech):
                print(p_o_speech)
                best_score = max([globals.SCORES[i[0]] for i in p_o_speech])
                best_word = [
                    i[0] for i in p_o_speech
                    if globals.SCORES[i[0]] == best_score
                ][0]
                new_text += sentences[enumeration].replace(
                    enumerations[enumeration][0], " " + best_word + " ") + " "
                globals.ENUMERATIONS_REMOVED.append(
                    enumerations[enumeration][0])

            # do the same thing again for the special case if the regular case didn't match
            else:
                if tokenized_enums[enumeration][1]:
                    p_o_speech_special_case = get_part_of_speech_enum(
                        tagged_sentences, tokenized_enums[enumeration][1])
                    count = 0
                    for enum_word in p_o_speech_special_case:
                        if (enum_word[1] in ('NOUN', 'ADJ', 'ADV')
                                or enum_word[0].lower() in ('și', 'si')):
                            count += 1
                    # if it is a special-case enumeration that has to be removed
                    if count == len(p_o_speech_special_case):
                        best_score = max([
                            globals.SCORES[i[0]]
                            for i in p_o_speech_special_case
                        ])
                        best_word = [
                            i[0] for i in p_o_speech_special_case
                            if globals.SCORES[i[0]] == best_score
                        ][0]
                        new_text += sentences[enumeration].replace(
                            enumerations[enumeration][1],
                            " " + best_word + " ") + " "
                        globals.ENUMERATIONS_REMOVED.append(
                            enumerations[enumeration][1])

        # if no enumeration was found, keep the sentence unchanged
        else:
            new_text += sentences[enumeration] + " "
    return new_text
Example #13
def getFeatures(gelen):
        
    
    yazi = list(filter(('').__ne__, gelen))
    
    # ----- FEATURE 1: SENTENCE LENGTH -----#
    f1 = np.zeros(len(yazi))
    for i in  range(len(yazi)):
        cumleuzunluk = len(yazi[i].split())
        f1[i]=cumleuzunluk
    f1 = f1/max(f1)
    
    # ---- FEATURE 2: SENTENCE POSITION ----#
    f2 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f2[i] = (len(yazi) - i) / len(yazi)
        
    # ---- FEATURE 3: TERM WEIGHT (TF/ISF) -----#
    
    f3 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    
    for i in  range(len(yazi)):
        f3[i] =(tfidf[i].sum())
    f3 = f3/max(f3)
    
    # ---- FEATURE 4: PROPER NOUNS ----#
    f5 = np.zeros(len(yazi))
    tagger = Tagger(language="english")
    for i in  range(len(yazi)):
        sayi = len([item for item in tagger.tag(yazi[i]) if item[1] == 'NOUN'])
        sayi = sayi / len(yazi[i].split())
        f5[i] = sayi
        
       
    # ---- FEATURE 5: THEMATIC WORDS ---#
    sw = getsw()
    c = Counter([i for i in ' '.join(yazi).lower().split() if i not in sw]).most_common(5)
    tematikler = [item[0] for item in c]
    f6 = np.zeros(len(yazi))
    for i in  range(len(yazi)):
        f6[i]=len(set(yazi[i].lower().split())&set(tematikler)) /len(yazi[i].split())
        
        
    # ---- FEATURE 6: NUMERIC DATA ---#
    f7 = np.zeros(len(yazi))
    for i in  range(len(yazi)):
        f7[i] = len([int(s) for s in yazi[i].split() if s.isdigit()]) /len(yazi[i].split())
       
       
    # ---- FEATURE 7: SENTENCE SIMILARITY SCORE ---#
    f8 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    
    for i in range(len(yazi)):
        f8[i] = cosine_similarity(tfidf[i], tfidf).sum()
    f8 = f8 / max(f8)
    
    
    sutunlar = ['f1_uzunluk', 'f2_konum', 'f3_tfisf', 'f4_özelisim', 'f5_tematik', 'f6_numerik', 'f7_benzerlik']
    ind = []
    for i in range(len(yazi)):
        ind.append(str(i))
    data = np.array([f1, f2, f3, f5, f6, f7, f8])

    Dframe = pd.DataFrame(data=data, index=sutunlar, columns=ind)
    dizi = Dframe.sum(axis=0).to_numpy()
    geridondur = []
    for t in range(len(dizi)):
        geridondur.append((dizi[t],t))
    # return the per-sentence total scores together with their sentence indices
    return geridondur
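
A usage sketch, not part of the original code: the returned (score, index) pairs can be sorted to keep, for example, the three highest-scoring sentences. It assumes the input list sentences contains no empty strings, so the indices line up with it:

scores = getFeatures(sentences)
best = sorted(scores, reverse=True)[:3]
# restore the original sentence order before joining the summary
summary = ' '.join(sentences[i] for _, i in sorted(best, key=lambda pair: pair[1]))
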
Example #14
def get_type(word):
    return Tagger(language='ro').tag(word)
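
A design note: get_type builds a new Tagger on every call, which is costly when it is called once per word as in Example #1. A hedged sketch that caches the tagger instead; _ro_tagger is a hypothetical helper:

from functools import lru_cache


@lru_cache(maxsize=1)
def _ro_tagger():
    # Build the Romanian tagger once and reuse it across calls.
    return Tagger(language='ro')


def get_type(word):
    return _ro_tagger().tag(word)
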
Example #15
def remove_dialog(text, alpha):
    text = re.sub(r'[ \t]*-', '-', text)
    final_text = ""
    i = 0
    while i < len(text):
        if i == 0 and text[i] == "-":
            while i < len(text) and text[i] != '\n':
                i += 1
        elif text[i] == "-" and text[i - 1] == "\n":
            while i < len(text) and text[i] != '\n':
                i += 1
        else:
            final_text += text[i]
            i += 1

    temp_text = clean_pre_text(text)
    tagger = Tagger(language="ro")
    original_len = len(temp_text)
    paragraphs = str.splitlines(temp_text)
    word_multiple_tags = dict()
    for paragraph in paragraphs:
        if paragraph == " " or paragraph == "":
            continue
        first_non_whitespace_position = 0
        while first_non_whitespace_position < len(paragraph) and paragraph[
                first_non_whitespace_position] in [" ", "\n", "\t"]:
            first_non_whitespace_position += 1
        paragraph = paragraph[first_non_whitespace_position:]
        if not paragraph or paragraph[0] != "-":
            continue
        #print("-------------")
        paragraph = paragraph[1:]
        paragraph = re.sub(r'[-]', ' ', paragraph)
        temp_tags = tagger.tag(paragraph)

        add_words_using_class(paragraph)
        #TODO: DECOMMENT THIS AND TEST IT BEFORE PROD RELEASE

        right_tags = []
        for it in temp_tags:
            if it[1] not in ["PUNCT", ""]:
                right_tags.append(it)
        for word, tag in right_tags:
            if word not in word_multiple_tags.keys():
                word_multiple_tags[word] = dict()
            if tag not in word_multiple_tags[word].keys():
                word_multiple_tags[word][tag] = 1
            else:
                word_multiple_tags[word][tag] += 1

    word_tag = dict()
    for word, tags_and_nr in word_multiple_tags.items():
        nr_max = 0
        nr_total = 0
        real_tag = "CONJ"
        for tag, nr in tags_and_nr.items():
            nr_total += nr
            if nr > nr_max:
                nr_max = nr
                real_tag = tag
        word_tag[word] = (real_tag, nr_total)

    #print(word_multiple_tags)
    #print("-------")
    #print(word_tag)
    for word, tag_nr in word_tag.items():
        tag = tag_nr[0]
        nr = tag_nr[1]
        update_dict(word, tag, nr)
    new_len = len(final_text)
    dialog_len = original_len - new_len
    alpha_dialog_cut = dialog_len * 1.0 / original_len * 100
    if int(alpha_dialog_cut) >= 99 - alpha or alpha_dialog_cut >= 100:
        new_alpha = 101
    else:
        new_alpha = int(100 * alpha / (100 - alpha_dialog_cut))
    return final_text, new_alpha
Example #16
def __getattr__(name):
    if name == "SCORES":
        if _SCORES == {}:
            """
            Acts as a property, will be used as "globals.SCORES[word_romanian]"
            
            Function that assigns a specific score to words based on their sentence parts
                proper noun = +4 score
                noun = +2 score
                verb = +2 score
                other = +1 score

            :param words: a dictionary for the words, where keys are the word in romanian and words[key] is the information
                about the respective word (output from 'find_singularity' function)

            :return: dictionary where each pair (key, value) will be (word, score_of_word)
            """

            stop_words = nltk.corpus.stopwords.words('romanian')
            word_count, _, _ = find_singularity(ORIGINAL_TEXT)
            tagger = Tagger(language='ro')

            words_part_of_sent = dict()
            for sentence in nltk.sent_tokenize(ORIGINAL_TEXT):
                sentence = re.sub("[.,!?%^~$„”\"\']", "", sentence)
                sentence = re.sub(":", " ", sentence)

                sentence = tagger.tag(sentence)
                for word in sentence:
                    word_in_ro = word[0]
                    sentence_part = word[1]

                    if word_in_ro not in words_part_of_sent.keys():
                        words_part_of_sent[word_in_ro] = defaultdict(lambda: 0)

                    words_part_of_sent[word_in_ro][sentence_part] += 1

            for word in words_part_of_sent.keys():
                max_word_part_count = 0
                word_part = ""

                if word.lower() in stop_words:
                    _SCORES[word] = 1
                    continue

                if word in _SCORES.keys():
                    continue

                for part_of_sent in words_part_of_sent[word].keys():
                    count = words_part_of_sent[word][part_of_sent]

                    if count > max_word_part_count:
                        max_word_part_count = count
                        word_part = part_of_sent

                if word_part in scores_points.keys():
                    _SCORES[word] = word_count[word] * scores_points[word_part]

                else:
                    _SCORES[word] = word_count[word] * scores_points["OTHER"]

        return _SCORES
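
scores_points is defined outside this snippet. Going by the docstring above (proper noun +4, noun +2, verb +2, anything else +1), it presumably looks roughly like this hypothetical reconstruction:

scores_points = {
    'PROPN': 4,  # proper noun
    'NOUN': 2,
    'VERB': 2,
    'OTHER': 1,  # fallback for any other part of speech
}
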
Example #17
def tag(tokens):
    tagger = Tagger(language=config.LANG_CODE)
    return tagger.tag(' '.join(tokens))
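
A usage sketch, assuming config.LANG_CODE is set to 'en'; the expected tags follow the English test case shown earlier:

tokens = ['The', 'quick', 'brown', 'fox']
print(tag(tokens))
# e.g. [('The', 'DET'), ('quick', 'ADJ'), ('brown', 'ADJ'), ('fox', 'NOUN')]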