Example #1
0
def remove_dialog(text, alpha):
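    # Removes dialog lines (lines that start with "-") from the text, POS-tags the words
    # found in those lines to update the global word statistics, and rescales alpha to
    # account for the removed portion of the text.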
    text = re.sub(r'[ \t]*-', '-', text)
    final_text = ""
    i = 0
    while i < len(text):
        if text[i] == "-" and (i == 0 or text[i - 1] == "\n"):
            # skip the rest of this dialog line (it starts with "-")
            while i < len(text) and text[i] != '\n':
                i += 1
        else:
            final_text += text[i]
            i += 1

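    # Collect per-word POS tag counts for the words that occur in dialog paragraphs.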
    temp_text = clean_pre_text(text)
    tagger = Tagger(language="ro")
    original_len = len(temp_text)
    paragraphs = str.splitlines(temp_text)
    word_multiple_tags = dict()
    for paragraph in paragraphs:
        # skip blank paragraphs and keep only dialog paragraphs (those that start with "-")
        paragraph = paragraph.lstrip(" \t\n")
        if paragraph == "" or paragraph[0] != "-":
            continue
        #print("-------------")
        paragraph = paragraph[1:]
        paragraph = re.sub(r'[-]', ' ', paragraph)
        temp_tags = tagger.tag(paragraph)

        add_words_using_class(paragraph)
        #TODO: DECOMMENT THIS AND TEST IT BEFORE PROD RELEASE

        right_tags = []
        for it in temp_tags:
            if it[1] not in ["PUNCT", ""]:
                right_tags.append(it)
        for word, tag in right_tags:
            if word not in word_multiple_tags:
                word_multiple_tags[word] = dict()
            word_multiple_tags[word][tag] = word_multiple_tags[word].get(tag, 0) + 1

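    # For every word keep its most frequent tag together with its total occurrence count.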
    word_tag = dict()
    for word, tags_and_nr in word_multiple_tags.items():
        nr_max = 0
        nr_total = 0
        real_tag = "CONJ"
        for tag, nr in tags_and_nr.items():
            nr_total += nr
            if nr > nr_max:
                nr_max = nr
                real_tag = tag
        word_tag[word] = (real_tag, nr_total)

    #print(word_multiple_tags)
    #print("-------")
    #print(word_tag)
    for word, (tag, nr) in word_tag.items():
        update_dict(word, tag, nr)
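    # rescale alpha so the requested percentage refers to the text that remains after dialog removal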
    new_len = len(final_text)
    dialog_len = original_len - new_len
    alpha_dialog_cut = dialog_len * 1.0 / original_len * 100
    if int(alpha_dialog_cut) >= 99 - alpha or alpha_dialog_cut >= 100:
        new_alpha = 101
    else:
        new_alpha = int(100 * alpha / (100 - alpha_dialog_cut))
    return final_text, new_alpha
Example #2
0
def tag(tokens):
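    # Join the tokens back into a single string because the Tagger works on raw text.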
    tagger = Tagger(language=config.LANG_CODE)
    return tagger.tag(' '.join(tokens))
def eliminate_enumerations(sentences):
    """
    This function eliminates enumerations from sentences.
    :param sentences: the output from @process_text
    :return: the input text with every detected enumeration replaced by the word that has
             the highest score in globals.SCORES (the output of @assign_score_to_words);
             sentences without enumerations are kept unchanged
    """
    enum_regexp = re.compile(
        r'((\w+\-?\w+\s*\,\s*){2,100}\w+\-?\w+)|((\w+\-?\w+\s*\,\s*){1,100}\s*\w+\s+(si)\s+\w+)'
    )
    enum_regexp_special_case = re.compile(r'((\w+\-?\w+\s*\,\s*){2,100})')
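    # enum_regexp matches comma-separated enumerations ("a, b, c" or "a, b si c");
    # enum_regexp_special_case only needs a run of two or more comma-terminated words.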
    tagger = Tagger(language="ro")
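    # tag the whole text once, then split it into sentences for per-sentence matching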
    tagged_sentences = tagger.tag(sentences)
    sentences = nltk.sent_tokenize(sentences)

    # finding the enumerations
    enumerations = list()
    for sentence in sentences:
        sent_enums = [
            enum_regexp.findall(sentence),
            enum_regexp_special_case.findall(sentence)
        ]
        enumerations.append(sent_enums)

    # process the findall output and take only the full_match enum
    for i in range(len(enumerations)):
        if enumerations[i][0]:
            # keep only the longest captured group, i.e. the full enumeration match
            enumerations[i][0] = max(enumerations[i][0][0], key=len)

        if enumerations[i][1]:
            enumerations[i][1] = max(enumerations[i][1][0], key=len)

    # split the enumerations into tokens of words in tokenized_enums
    tokenized_enums = list()
    token_regex = re.compile(r"\w+-?\w*")
    for it in enumerations:
        if it != [[], []]:
            tokenized_enum = [
                token_regex.findall(str(it[0])),
                token_regex.findall(str(it[1]))
            ]
            tokenized_enums.append(tokenized_enum)
        else:
            tokenized_enums.append([[], []])

    # the output text
    new_text = ''

    # for each enumeration
    for enumeration in range(0, len(enumerations)):

        # if they are not null
        if enumerations[enumeration] != [[], []]:

            # call the function that outputs the part of speech
            p_o_speech = get_part_of_speech_enum(
                tagged_sentences, tokenized_enums[enumeration][0])

            # check if the words from each enumeration are NOUN, ADJ or ADV
            count = 0
            for enum_word in p_o_speech:
                if enum_word[1] in ('NOUN', 'ADJ', 'ADV') or enum_word[0].lower() in ('și', 'si'):
                    count += 1
            # if they are then eliminate the enum from the sentence and put it in output text
            if count > 0 and count == len(p_o_speech):
                print(p_o_speech)
                best_score = max([globals.SCORES[i[0]] for i in p_o_speech])
                best_word = [
                    i[0] for i in p_o_speech
                    if globals.SCORES[i[0]] == best_score
                ][0]
                new_text += sentences[enumeration].replace(
                    enumerations[enumeration][0], " " + best_word + " ") + " "
                globals.ENUMERATIONS_REMOVED.append(
                    enumerations[enumeration][0])

            # do the same thing again for the special case if the regular case didn't match
            else:
                if tokenized_enums[enumeration][1]:
                    p_o_speech_special_case = get_part_of_speech_enum(
                        tagged_sentences, tokenized_enums[enumeration][1])
                    count = 0
                    for enum_word in p_o_speech_special_case:
                        if enum_word[1] in ('NOUN', 'ADJ', 'ADV') or enum_word[0].lower() in ('și', 'si'):
                            count += 1
                    # if this is an enumeration that must be eliminated
                    if count == len(p_o_speech_special_case):
                        best_score = max([
                            globals.SCORES[i[0]]
                            for i in p_o_speech_special_case
                        ])
                        best_word = [
                            i[0] for i in p_o_speech_special_case
                            if globals.SCORES[i[0]] == best_score
                        ][0]
                        new_text += sentences[enumeration].replace(
                            enumerations[enumeration][1],
                            " " + best_word + " ") + " "
                        globals.ENUMERATIONS_REMOVED.append(
                            enumerations[enumeration][1])

        # if no enumeration was found, keep the sentence unchanged
        else:
            new_text += sentences[enumeration] + " "
    return new_text
Example #4
0
def getFeatures(gelen):
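    # Computes seven sentence-level features (length, position, TF/ISF weight, noun ratio,
    # thematic-word ratio, numeric-token ratio and similarity score) and returns one
    # (summed score, sentence index) pair per sentence.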
    # drop empty strings from the input sentence list
    yazi = list(filter(('').__ne__, gelen))
    
    # -----FEATURE 1 SENTENCE LENGTH -----#
    f1 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        cumleuzunluk = len(yazi[i].split())
        f1[i] = cumleuzunluk
    f1 = f1 / max(f1)
    
    #----FEATURE 2 SENTENCE POSITION ----#
    f2 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        # earlier sentences get higher position scores; use i directly instead of
        # yazi.index(yazi[i]), which breaks when a sentence appears more than once
        f2[i] = (len(yazi) - i) / len(yazi)
        
    #----FEATURE 3 TERM WEIGHT TF/ISF -----#
    
    f3 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)

    for i in range(len(yazi)):
        f3[i] = tfidf[i].sum()
    f3 = f3 / max(f3)
    
    # FEATURE 4 PROPER NOUN ----#
    f5 = np.zeros(len(yazi))
    tagger = Tagger(language="english")
    for i in range(len(yazi)):
        sayi = len([item for item in tagger.tag(yazi[i]) if item[1] == 'NOUN'])
        sayi = sayi / len(yazi[i].split())
        f5[i] = sayi
        
       
    #----FEATURE 5 THEMATIC WORDS ---#
    sw = getsw()
    # the five most frequent non-stopword terms are treated as the thematic vocabulary
    c = Counter([i for i in ' '.join(yazi).lower().split() if i not in sw]).most_common(5)
    tematikler = [item[0] for item in c]
    f6 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f6[i] = len(set(yazi[i].lower().split()) & set(tematikler)) / len(yazi[i].split())
        
        
    #----FEATURE 6 numeric values ---#
    f7 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f7[i] = len([int(s) for s in yazi[i].split() if s.isdigit()]) / len(yazi[i].split())
       
       
    #---- FEATURE 7 SENTENCE SIMILARITY SCORE ---#
    # each sentence's score is the sum of its cosine similarities to all sentences
    f8 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)

    for i in range(len(yazi)):
        f8[i] = cosine_similarity(tfidf[i], tfidf).sum()
    f8 = f8 / max(f8)
    
    
    sutunlar = ['f1_uzunluk', 'f2_konum', 'f3_tfisf', 'f4_özelisim', 'f5_tematik', 'f6_numerik', 'f7_benzerlik']
    ind = [str(i) for i in range(len(yazi))]
    # rows are the seven features, columns are the sentence indices
    data = np.array([f1, f2, f3, f5, f6, f7, f8])

    Dframe = pd.DataFrame(data=data, index=sutunlar, columns=ind)
    # .as_matrix() has been removed from pandas; .to_numpy() is the current equivalent
    dizi = Dframe.sum(axis=0).to_numpy()
    geridondur = []
    for t in range(len(dizi)):
        geridondur.append((dizi[t], t))
    # return one (summed feature score, sentence index) pair per sentence
    return geridondur