import re

import nltk
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: Tagger, config, globals, clean_pre_text, add_words_using_class,
# update_dict, get_part_of_speech_enum and getsw are project-level helpers
# whose import paths are not shown in this snippet; they are assumed to be
# available in this module's namespace.


def remove_dialog(text, alpha):
    """Strip dialogue lines (paragraphs starting with "-") from `text` and
    rescale the compression target `alpha` to the remaining narrative text.

    Returns (text_without_dialogue, adjusted_alpha); an adjusted_alpha of
    101 signals that removing the dialogue alone already satisfies the
    requested cut.
    """
    # Normalise "  -" / "\t-" to a bare "-" so dialogue markers are uniform.
    text = re.sub(r'[ \t]*-', '-', text)

    # Copy everything except lines that start with "-" (dialogue lines).
    final_text = ""
    i = 0
    while i < len(text):
        if i == 0 and text[i] == "-":
            while i < len(text) and text[i] != '\n':
                i += 1
        elif text[i] == "-" and text[i - 1] == "\n":
            while i < len(text) and text[i] != '\n':
                i += 1
        else:
            final_text += text[i]
            i += 1

    temp_text = clean_pre_text(text)
    tagger = Tagger(language="ro")
    original_len = len(temp_text)
    paragraphs = temp_text.splitlines()

    # word -> {POS tag -> number of occurrences with that tag}
    word_multiple_tags = dict()
    for paragraph in paragraphs:
        # Skip blank paragraphs. The original check only caught "" and " ",
        # so an all-whitespace line crashed the paragraph[0] test below.
        paragraph = paragraph.lstrip(" \n\t")
        if not paragraph:
            continue
        # Only dialogue paragraphs are POS-tagged here.
        if paragraph[0] != "-":
            continue
        paragraph = paragraph[1:]
        paragraph = re.sub(r'-', ' ', paragraph)

        temp_tags = tagger.tag(paragraph)
        add_words_using_class(paragraph)  # TODO: test before prod release

        # Keep only real words; drop punctuation and empty tags.
        right_tags = [it for it in temp_tags if it[1] not in ["PUNCT", ""]]
        for word, tag in right_tags:
            if word not in word_multiple_tags:
                word_multiple_tags[word] = dict()
            word_multiple_tags[word][tag] = \
                word_multiple_tags[word].get(tag, 0) + 1

    # For each word keep its most frequent tag and its total count.
    word_tag = dict()
    for word, tags_and_nr in word_multiple_tags.items():
        nr_max = 0
        nr_total = 0
        real_tag = "CONJ"
        for tag, nr in tags_and_nr.items():
            nr_total += nr
            if nr > nr_max:
                nr_max = nr
                real_tag = tag
        word_tag[word] = (real_tag, nr_total)

    for word, (tag, nr) in word_tag.items():
        update_dict(word, tag, nr)

    # Rescale alpha: the dialogue just removed already counts toward the
    # requested percentage cut.
    if original_len == 0:
        return final_text, alpha  # guard against empty input
    new_len = len(final_text)
    dialog_len = original_len - new_len
    alpha_dialog_cut = dialog_len * 100.0 / original_len
    if int(alpha_dialog_cut) >= 99 - alpha or alpha_dialog_cut >= 100:
        new_alpha = 101
    else:
        new_alpha = int(100 * alpha / (100 - alpha_dialog_cut))
    return final_text, new_alpha
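
# Usage sketch for remove_dialog (illustrative only: the sample text and the
# alpha value of 40 are assumptions, and the project helpers Tagger and
# clean_pre_text must be importable for the call to actually run).
def _demo_remove_dialog():
    sample = "- Buna ziua!\nAcesta este un paragraf narativ.\n- La revedere!"
    stripped, new_alpha = remove_dialog(sample, 40)
    # `stripped` keeps only the narrative line; `new_alpha` rescales the
    # compression target to account for the dialogue already removed.
    print(stripped, new_alpha)
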
def tag(tokens):
    tagger = Tagger(language=config.LANG_CODE)
    return tagger.tag(' '.join(tokens))
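
# Hedged sketch of how tag() is presumably consumed, assuming config.LANG_CODE
# is set (e.g. "ro") and the backend Tagger yields (token, POS) pairs, which
# is what the callers in this module expect.
def _demo_tag():
    for token, pos in tag(["Ana", "are", "mere"]):
        print(token, pos)
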
def eliminate_enumerations(sentences):
    """Eliminate enumerations from sentences.

    :param sentences: the text to process (the output from @process_text)
    :return: the text with each detected enumeration replaced by its
             highest-scoring word (per globals.SCORES); sentences without
             enumerations are kept unchanged
    """
    # "a, b, c" or "a, b si c" style enumerations.
    enum_regexp = re.compile(
        r'((\w+-?\w+\s*,\s*){2,100}\w+-?\w+)|((\w+-?\w+\s*,\s*){1,100}\s*\w+\s+(si)\s+\w+)'
    )
    # Special case: a comma-separated run with no closing word.
    enum_regexp_special_case = re.compile(r'((\w+-?\w+\s*,\s*){2,100})')

    tagger = Tagger(language="ro")
    tagged_sentences = tagger.tag(sentences)
    sentences = nltk.sent_tokenize(sentences)

    # Find the enumerations in each sentence.
    enumerations = list()
    for sentence in sentences:
        sent_enums = [
            enum_regexp.findall(sentence),
            enum_regexp_special_case.findall(sentence)
        ]
        enumerations.append(sent_enums)

    # findall returns tuples of groups; keep only the longest (full) match.
    for i in range(len(enumerations)):
        if enumerations[i][0]:
            max_len = max(len(j) for j in enumerations[i][0][0])
            max_len_index = [
                j for j in range(len(enumerations[i][0][0]))
                if len(enumerations[i][0][0][j]) == max_len
            ][0]
            enumerations[i][0] = enumerations[i][0][0][max_len_index]
        if enumerations[i][1]:
            max_len = max(len(j) for j in enumerations[i][1][0])
            max_len_index = [
                j for j in range(len(enumerations[i][1][0]))
                if len(enumerations[i][1][0][j]) == max_len
            ][0]
            enumerations[i][1] = enumerations[i][1][0][max_len_index]

    # Split each enumeration into word tokens.
    tokenized_enums = list()
    token_regex = re.compile(r"\w+-?\w*")
    for it in enumerations:
        if it != [[], []]:
            tokenized_enums.append([
                token_regex.findall(str(it[0])),
                token_regex.findall(str(it[1]))
            ])
        else:
            tokenized_enums.append([[], []])

    # The output text.
    new_text = ''
    for enumeration in range(len(enumerations)):
        if enumerations[enumeration] != [[], []]:
            replaced = False
            # Part of speech for every word in the enumeration.
            p_o_speech = get_part_of_speech_enum(
                tagged_sentences, tokenized_enums[enumeration][0])
            # The enumeration is removable only if every word is a NOUN,
            # ADJ, ADV or the conjunction "si"/"și".
            count = 0
            for enum_word in p_o_speech:
                if (enum_word[1] in ('NOUN', 'ADJ', 'ADV')
                        or enum_word[0].lower() in ('și', 'si')):
                    count += 1
            if count > 0 and count == len(p_o_speech):
                # Replace the enumeration with its highest-scoring word.
                best_score = max(globals.SCORES[i[0]] for i in p_o_speech)
                best_word = [
                    i[0] for i in p_o_speech
                    if globals.SCORES[i[0]] == best_score
                ][0]
                new_text += sentences[enumeration].replace(
                    enumerations[enumeration][0],
                    " " + best_word + " ") + " "
                globals.ENUMERATIONS_REMOVED.append(
                    enumerations[enumeration][0])
                replaced = True
            # Retry with the special case if the regular case didn't qualify.
            elif tokenized_enums[enumeration][1]:
                p_o_speech_special_case = get_part_of_speech_enum(
                    tagged_sentences, tokenized_enums[enumeration][1])
                count = 0
                for enum_word in p_o_speech_special_case:
                    if (enum_word[1] in ('NOUN', 'ADJ', 'ADV')
                            or enum_word[0].lower() in ('și', 'si')):
                        count += 1
                # If it is an enumeration that has to be eliminated.
                if count > 0 and count == len(p_o_speech_special_case):
                    best_score = max(globals.SCORES[i[0]]
                                     for i in p_o_speech_special_case)
                    best_word = [
                        i[0] for i in p_o_speech_special_case
                        if globals.SCORES[i[0]] == best_score
                    ][0]
                    new_text += sentences[enumeration].replace(
                        enumerations[enumeration][1],
                        " " + best_word + " ") + " "
                    globals.ENUMERATIONS_REMOVED.append(
                        enumerations[enumeration][1])
                    replaced = True
            # Bug fix: sentences whose enumeration failed the POS check were
            # silently dropped from the output; keep them unchanged instead.
            if not replaced:
                new_text += sentences[enumeration] + " "
        else:
            # No enumeration found; keep the sentence unchanged.
            new_text += sentences[enumeration] + " "
    return new_text
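
# Illustrative call for eliminate_enumerations. The sample text is an
# assumption, and globals.SCORES must already map every word to a salience
# score, since the function reads (but never writes) it.
def _demo_eliminate_enumerations():
    text = "Am cumparat mere, pere, prune si struguri. Afara ploua."
    print(eliminate_enumerations(text))
    # A qualifying enumeration collapses to its highest-scoring word and is
    # also recorded in globals.ENUMERATIONS_REMOVED.
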
def getFeatures(gelen):
    """Score each sentence on seven surface features and return
    (total_score, sentence_index) pairs for ranking.
    """
    # Drop empty strings from the sentence list.
    yazi = list(filter(('').__ne__, gelen))

    # ----- FEATURE 1: sentence length -----#
    f1 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f1[i] = len(yazi[i].split())
    f1 = f1 / max(f1)

    # ----- FEATURE 2: sentence position -----#
    f2 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        # Bug fix: yazi.index(yazi[i]) returned the first occurrence of a
        # duplicate sentence; the loop index is the intended position.
        f2[i] = (len(yazi) - i) / len(yazi)

    # ----- FEATURE 3: term weight TF/ISF -----#
    f3 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    for i in range(len(yazi)):
        f3[i] = tfidf[i].sum()
    f3 = f3 / max(f3)

    # ----- FEATURE 4: proper nouns (approximated via the NOUN tag) -----#
    f4 = np.zeros(len(yazi))
    tagger = Tagger(language="english")
    for i in range(len(yazi)):
        sayi = len([item for item in tagger.tag(yazi[i])
                    if item[1] == 'NOUN'])
        f4[i] = sayi / len(yazi[i].split())

    # ----- FEATURE 5: thematic words (top 5 non-stopword tokens) -----#
    sw = getsw()
    c = Counter([i for i in ' '.join(yazi).lower().split()
                 if i not in sw]).most_common(5)
    tematikler = [item[0] for item in c]
    f5 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f5[i] = len(set(yazi[i].lower().split())
                    & set(tematikler)) / len(yazi[i].split())

    # ----- FEATURE 6: numeric data -----#
    f6 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f6[i] = len([int(s) for s in yazi[i].split()
                     if s.isdigit()]) / len(yazi[i].split())

    # ----- FEATURE 7: sentence similarity score -----#
    f7 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    for i in range(len(yazi)):
        f7[i] = cosine_similarity(tfidf[i], tfidf).sum()
    f7 = f7 / max(f7)

    # Feature arrays renumbered f1..f7 so names and labels line up
    # (the original skipped f4 and used f5..f8 for features 4-7).
    sutunlar = ['f1_length', 'f2_position', 'f3_tfisf', 'f4_proper_noun',
                'f5_thematic', 'f6_numeric', 'f7_similarity']
    ind = [str(i) for i in range(len(yazi))]
    data = np.array([f1, f2, f3, f4, f5, f6, f7])
    Dframe = pd.DataFrame(data=data, index=sutunlar, columns=ind)
    # .as_matrix() was removed from pandas; .to_numpy() is the replacement.
    dizi = Dframe.sum(axis=0).to_numpy()

    geridondur = []
    for t in range(len(dizi)):
        geridondur.append((dizi[t], t))
    # A (score, sentence index) pair is returned for every sentence
    # in the document.
    return geridondur
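
# Sketch of ranking sentences with getFeatures; the sample sentences are made
# up. Sorting the returned (score, index) pairs in descending order yields an
# extractive-summary ordering, which is how the tuples appear meant to be used.
def _demo_getFeatures():
    sentences = [
        "The market gained 3 percent on Monday.",
        "Analysts expect further growth.",
        "It rained in the afternoon.",
    ]
    ranked = sorted(getFeatures(sentences), reverse=True)
    top_indices = [idx for score, idx in ranked[:2]]
    print(top_indices)
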