def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")

    # Configuring Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)
    tagger = StanfordPOSTagger(model_filename=path_to_model, path_to_jar=path_to_jar)

    # Setting higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]
        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except Exception:
            # Skip documents the tagger cannot process
            continue
        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
    return data
def get_tagger():
    '''Set up & return the Stanford Tagger object.'''
    path_to_model = "/home/avery/Applications/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
    path_to_jar = "/home/avery/Applications/stanford-postagger-2018-02-27/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = "-mx8192m"
    # Use: tagger.tag(word_tokenize(string))
    return tagger
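# Hedged usage sketch (not part of the original snippet): shows how the tagger returned by
# get_tagger() above might be called. Assumes NLTK is installed, the hard-coded Stanford
# paths above exist, and that StanfordPOSTagger is imported as in the other snippets here;
# the sample sentence is illustrative only.
from nltk.tokenize import word_tokenize

example_tagger = get_tagger()
# tag() takes a list of tokens and returns a list of (word, POS-tag) tuples
print(example_tagger.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))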
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
def posInput(text):
    print("POS")
    path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
    path_to_jar = "./stanford-postagger/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
    # sentence = 'THIS IS TESTING'
    result = tagger.tag(word_tokenize(text))
    # print(result)
    return result
def _POS(self, txt, id):
    self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
    path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
    from nltk.tag.stanford import StanfordPOSTagger
    tagger = StanfordPOSTagger(model_path, path_pos)
    tagger.java_options = '-mx8096m'  ### Setting higher memory limit for long sentences
    tokens = nltk.word_tokenize(txt)
    pos_res = tagger.tag(tokens)
    filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
    with open(filepath, 'w') as file_handler:
        for item in pos_res:
            file_handler.write("{}\n".format(item))
    return pos_res
def transform_to_pos(text):
    import os
    # os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

    tokens = word_tokenize(text)
    size = len(tokens)

    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    try:
        counts.pop(".")
    except KeyError:
        pass

    from collections import OrderedDict
    ot = ['NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB', 'TO',
          'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':', 'WRB', 'EX',
          'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks', 'exclamationmarks',
          'Quotes']
    counts = OrderedDict(counts)
    for key in ot:
        if key not in counts:
            counts[key] = 0
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    # raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    # raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    # raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')
    # raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',  #### in folder data/
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')  #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()
    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore, see https://github.com/nltk/nltk/issues/1632.
        # If using nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the
        # "parse_output" function of nltk/tag/stanford.py to
        # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)" to handle the underscore issue.
        text_pos = english_postagger.tag(text)
        index = 0
        for token in text_pos:
            # Deal with double quotes: in nltk.tokenize treebank.py the tokenizer changes double
            # quotes (") to doubled single forward- and backward-quotes (`` and '').
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
            #     text_pos[index] = ["\"", "''"]
            if text[index] == token[0] and token[0] == "``" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "''"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
def pos_sentence(start=0, end=63):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')  #### in folder data/
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',  #### in folder data/
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')  #### in folder data/
    english_postagger.java_options = '-mx4096m'
    pos = list()
    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" + raw_dir_simple[data_id])
        print(raw_dir_simple[data_id])
        pos_sentences = list()
        for sent_span in sentences_spans:
            print(sent_span[0])
            text = nltk.word_tokenize(sent_span[0])
            # StanfordPOSTagger fails to tag the underscore, see https://github.com/nltk/nltk/issues/1632.
            # If using nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the
            # "parse_output" function of nltk/tag/stanford.py to
            # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)" to handle the underscore issue.
            k = english_postagger.tag(text)
            index = 0
            for token in k:
                # Deal with double quotes: in nltk.tokenize treebank.py the tokenizer changes double
                # quotes (") to doubled single forward- and backward-quotes (`` and '').
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "''"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            pos_sentences.append(k)
        read.save_in_json("training_sentence/pos/" + raw_dir_simple[data_id], pos_sentences)
    read.save_in_json("training_sentence/pos/pos_tag", pos)
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    # raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    # raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    # raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')
    # raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,       #### in folder data/
        StandforParser_jar)   #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()
    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore, see https://github.com/nltk/nltk/issues/1632.
        # If using nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the
        # "parse_output" function of nltk/tag/stanford.py to
        # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)" to handle the underscore issue.
        text_pos = english_postagger.tag(text)
        index = 0
        for token in text_pos:
            # Deal with double quotes: in nltk.tokenize treebank.py the tokenizer changes double
            # quotes (") to doubled single forward- and backward-quotes (`` and '').
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
            #     text_pos[index] = ["\"", "''"]
            if text[index] == token[0] and token[0] == "``" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "''"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    data_size = len(raw_text_dir)
    pos = list()
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print(raw_text_dir[data_id])
        contents = list()
        for line in raw_text.splitlines():
            print(line)
            text = nltk.word_tokenize(line)
            print(text)
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
            index = 0
            for token in k:
                # Deal with double quotes: in nltk.tokenize treebank.py the tokenizer changes double
                # quotes (") to doubled single forward- and backward-quotes (`` and '').
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "''"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            contents.append(k)
        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1], contents)
    read.save_in_json("pos_tag", pos)
from nltk.tag.stanford import StanfordPOSTagger
import os

path_to_model = os.path.join(os.getcwd(), "StanfordNLP/pos/models/english-bidirectional-distsim.tagger")
path_to_jar = os.path.join(os.getcwd(), "StanfordNLP/pos/stanford-postagger.jar")
POStagger = StanfordPOSTagger(path_to_model, path_to_jar)
POStagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

from nltk.tokenize import word_tokenize


def getPOSFocus(sentence):
    sentence = sentence.lower()
    result = []
    tagged = POStagger.tag(word_tokenize(sentence))
    for (word, cat) in tagged:
        if cat.startswith("NN") or cat.startswith("JJ"):
            result.append(word.lower())
    return result


if __name__ == '__main__':
    text = "Where is HEC( Himalayan Explorers Club ) office ?"
    print(getPOSFocus(text))
def confidence(self, features):
    votes = []
    for c in self._classifiers:
        v = c.classify(features)
        votes.append(v)
    choice_votes = votes.count(mode(votes))
    conf = choice_votes / len(votes)
    return conf


if __name__ == '__main__':
    path_to_model = 'stanford-postagger/models/english-bidirectional-distsim.tagger'
    path_to_jar = 'stanford-postagger/stanford-postagger.jar'
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx512m'

    # Load the data-set
    pos_review = open('short_reviews/train/positive.txt', encoding='ISO-8859-1').readlines()
    neg_review = open('short_reviews/train/negative.txt', encoding='ISO-8859-1').readlines()

    multiThread_parse(pos_review, neg_review)

    # Save all the adjectives to a file
    with open("trained_models/documents.pickle", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    all_words = nltk.FreqDist(all_words)
    print(len(list(all_words.keys())))
    word_features = list(all_words.keys())[:6000]
# https://stackoverflow.com/questions/34692987/cant-make-stanford-pos-tagger-working-in-nltk
# https://nlp.stanford.edu/software/tagger.html
# http://www.nltk.org/_modules/nltk/tag/stanford.html#CoreNLPPOSTagger
import os
from nltk.tag.stanford import StanfordPOSTagger

current_path = os.path.dirname(os.path.realpath(__file__))
path_to_model = "input/stanford/stanford-postagger-full-2018-10-16/models/english-bidirectional-distsim.tagger"
path_to_jar = "input/stanford/stanford-postagger-full-2018-10-16/stanford-postagger.jar"
path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)
standford_tagger = StanfordPOSTagger(path_to_model, path_to_jar)
standford_tagger.java_options = '-mx1024m'  ### Setting higher memory limit for long sentences

# https://pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/
from nltk.tag import StanfordNERTagger

path_to_model = "input/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz"
path_to_jar = "input/stanford/stanford-ner-2014-08-27/stanford-ner.jar"
path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)
standford_ner = StanfordNERTagger(path_to_model, path_to_jar)
standford_ner.java_options = '-mx1024m'  ### Setting higher memory limit for long sentences
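# Hedged usage sketch (not part of the original module): shows how the module-level
# standford_tagger and standford_ner defined above might be applied to one tokenized
# sentence. Assumes the Stanford model/jar paths above exist and NLTK's word_tokenize
# is available; the sample sentence is illustrative only.
if __name__ == '__main__':
    from nltk.tokenize import word_tokenize

    tokens = word_tokenize("Barack Obama was born in Hawaii.")
    print(standford_tagger.tag(tokens))  # [(word, POS tag), ...]
    print(standford_ner.tag(tokens))     # [(word, entity label), ...]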
def calculate_nouns_per_doc_per_part(rootdirectory, df_file_from_pos):
    from pos import prepare_full_doc
    import pandas as pd
    import re
    import os
    from log_creating import df_into_csv
    from nltk.tag.stanford import StanfordPOSTagger
    """
    This function lets you calculate number of words per part per article per corpus;
    calculate nouns per part per article per corpus
    """
    df = pd.DataFrame({'Subcorpus': [], 'Type of Article': [], 'Article': [], 'Teil': [],
                       'WordsN': [], 'NounsN': []})
    print("df is created", df)
    t = ''
    Teil1 = "<Intro>"
    Teil2 = "<Middle>"
    Teil3 = "<Conclusion>"
    path_to_model = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
    path_to_jar = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_181\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger.java_options = '-mx4096m'
    for subdir, dirs, files in os.walk(rootdirectory):
        for file in files:
            # print(subdir)
            sub = re.sub(".*/", "", subdir)
            filepath = subdir + os.sep + file
            # if (filepath.endswith(".txt")) & (file.startswith("log.txt") == False) & (file.startswith("README.txt") == False):
            word_count = 0
            CNC_up_3_count = 0
            words_for_CNCs = 0
            print(sub)
            print(filepath)
            print(file)
            print("Processing " + file + " in " + sub)
            t = prepare_full_doc(filepath)
            sentences = re.split(r"\.|\?|:|;", t)
            last_first_word = ""
            noun_count = 0
            word_number = 0
            Teile = {Teil1: [0, 0], Teil2: [0, 0], Teil3: [0, 0]}  # 0 = words number; 1 = nouns number
            Current_teil = Teil1
            for sent in sentences:
                p = []
                sent = re.sub('\n', ' ', sent)  ### add more deleting enters
                try:
                    p = tagger.tag(re.split(" ", sent))  # list of tuples for every sentence
                except Exception:
                    print("failed to tag the sentence: " + sent)
                for word in p:
                    word_number += 1
                    if ((len(word[0]) > 2) & (re.search(r"[\d@+]", word[0]) == None) & (re.search(r"\(.\)", word[0]) == None)) & (
                            (word[1] == 'NN') | (word[1] == 'NNS')):
                        noun_count += 1
                    if word[0] == Teil2:
                        print("End of Introduction")
                        print("Teil 1", word_number, noun_count)
                        Teile[Teil1][0] = word_number
                        Teile[Teil1][1] = noun_count
                        word_number = 0
                        noun_count = 0
                        Current_teil = Teil2
                    elif word[0] == Teil3:
                        print("End of Middle")
                        print("Teil 2", word_number, noun_count)
                        Teile[Teil2][0] = word_number
                        Teile[Teil2][1] = noun_count
                        word_number = 0
                        noun_count = 0
                        Current_teil = Teil3
            print("Teil 3", word_number, noun_count)
            print("End of Conclusion")
            Teile[Teil3][0] = word_number
            Teile[Teil3][1] = noun_count
            word_number = 0
            noun_count = 0
            df = df.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Intro>",
                            'WordsN': Teile[Teil1][0], 'NounsN': Teile[Teil1][1]}, ignore_index=True)
            df = df.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Middle>",
                            'WordsN': Teile[Teil2][0], 'NounsN': Teile[Teil2][1]}, ignore_index=True)
            df = df.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Conclusion>",
                            'WordsN': Teile[Teil3][0], 'NounsN': Teile[Teil3][1]}, ignore_index=True)
            print(df)
    ## for_changes = pd.read_csv(df_file_from_pos, sep='\t', header=0)
    ## table = pd.pivot_table(for_changes, index=['Article', 'Teil'], values='CNC Length', aggfunc=np.sum)
    ## print(type(table))
    df_into_csv(df, "df_nouns_and_all_words_counts_whole_corpus_part_2.csv")
def get_CNCs_up_3_of_a_txt_file(df2, df, filepath, file, sub, parts_directory):
    """
    Gets all CNCs of one text and creates files in parts folder for them
    """
    # print("get_CNCs_up_3_of_a_txt_file", df)
    import os
    import pandas as pd
    from nltk.tag.stanford import StanfordPOSTagger
    path_to_model = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
    path_to_jar = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_181\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger.java_options = '-mx4096m'
    from log_creating import write_log
    import re
    words_for_CNCs = 0
    word_count = 0
    CNC_up_3_count = 0
    noun_count = 0
    if (filepath.endswith(".txt")) & (file.startswith("log.txt") == False) & (file.startswith("README.txt") == False):
        word_count = 0
        noun_count = 0
        Teil1 = "<Intro>"
        Teil2 = "<Middle>"
        Teil3 = "<Conclusion>"
        Ende = "<Ending>"
        Teile2 = {Teil1: [0, 0, 0], Teil2: [0, 0, 0], Teil3: [0, 0, 0]}
        CNC_up_3_count = 0
        words_for_CNCs = 0
        print(sub)
        print(filepath)
        print(file)
        write_log(parts_directory, "Processing " + file + " in " + sub)
        t = prepare_full_doc(filepath)
        sentences = re.split(r"\.|\?|:|;", t)
        sent_number = 0
        word_number = 0
        # 0 = word count; 1 = CNC up 3 count; 2 = how many words contain CNCs;
        # 3 = what part of all words that are in CNCs are the CNCs in this particular part
        Teile = {Teil1: [0, 0, 0, 0], Teil2: [0, 0, 0, 0], Teil3: [0, 0, 0, 0]}
        Current_teil = Teil1
        for sent in sentences:
            p = []
            sent = re.sub('\n', ' ', sent)  ### add more deleting enters
            sent_number += 1
            try:
                p = tagger.tag(re.split(" ", sent))  # list of tuples for every sentence
            except Exception:
                write_log(parts_directory, "failed to tag the sentence: " + sent)
            maxCNC = 0
            maxCNCstring = ""
            last_first_word = ""
            other_word = False
            for word in p:
                # print(word)
                word_number += 1
                # print(word_number)
                word_count += 1
                ## print("word_count = ", word_count)
                if ((len(word[0]) > 2) & (re.search(r"[\d@+]", word[0]) == None) & (re.search(r"\(.\)", word[0]) == None)) & (
                        (word[1] == 'NN') | (word[1] == 'NNS') |
                        (((word[1] == 'JJ') | (word[1] == 'JJR') | (word[1] == 'JJS')) & (maxCNC == 0))):
                    word_word = word[0]
                    if ((len(word[0]) > 2) & (re.search(r"[\d@+]", word[0]) == None) & (re.search(r"\(.\)", word[0]) == None)) & (
                            (word[1] == 'NN') | (word[1] == 'NNS')):
                        noun_count += 1
                    # print(word)
                    if re.search("/", word_word) == None:
                        word_word = re.split("/", word_word)[0]
                    if other_word == True:
                        if last_first_word != "":
                            maxCNCstring = last_first_word + " " + word_word
                            last_first_word = ""
                            maxCNC = 2
                            other_word = False
                        else:
                            other_word = False
                            CNC_up_3_count, words_for_CNCs, df = create_maxCNC_txt_in_parts_if_up3_and_count(
                                df, words_for_CNCs, CNC_up_3_count, maxCNCstring, maxCNC, parts_directory,
                                sub, file, Current_teil, sent_number, word_number, sentences)
                            maxCNC = 1
                            maxCNCstring = word_word
                    else:
                        if ((word_word[len(word_word) - 1] == ")") | (word_word[len(word_word) - 1] == "’") |
                                (word_word[len(word_word) - 1] == "’") | (word_word[len(word_word) - 1] == ",") |
                                (word_word[len(word_word) - 1] == ";") | (word_word[len(word_word) - 1] == "'") |
                                (word_word[len(word_word) - 1] == "'") | (word_word[len(word_word) - 1] == ":")):
                            other_word = True
                            maxCNC += 1
                            maxCNCstring += " " + word_word
                            last_first_word = ""
                        else:
                            try:
                                if re.search(r"[:\"\(\[]", word_word[0]) != None:
                                    last_first_word = word_word
                                    other_word = True
                                else:
                                    maxCNC += 1
                                    maxCNCstring += " " + word_word
                            except Exception:
                                write_log(parts_directory, "Fail to process ( in word_word: " + word_word)
                    ## if ((word_word[len(word_word)-2]==")")|(word_word[len(word_word)-2]=="’")|(word_word[len(word_word)-2]=="’")|(word_word[len(word_word)-2]==",")|(word_word[len(word_word)-2]==";")|(word_word[len(word_word)-2]=="'")|(word_word[len(word_word)-2]=="'")|(word_word[len(word_word)-2]==":")):
                    ##     other_word = True
                else:
                    other_word = True
                    last_first_word = ""
                if re.search(Teil2, word[0]) != None:
                    write_log(parts_directory, "End of Introduction")
                    print("Teil 1", word_count, CNC_up_3_count, words_for_CNCs)
                    Teile[Teil1][0] = word_count
                    Teile[Teil1][1] = CNC_up_3_count
                    Teile[Teil1][2] = words_for_CNCs
                    Teile2[Teil1][0] = word_count
                    Teile2[Teil1][1] = noun_count
                    word_count = 0
                    noun_count = 0
                    CNC_up_3_count = 0
                    words_for_CNCs = 0
                    Current_teil = Teil2
                elif re.search(Teil3, word[0]) != None:
                    write_log(parts_directory, "End of Middle")
                    print("Teil 2", word_count, CNC_up_3_count, words_for_CNCs)
                    Teile[Teil2][0] = word_count
                    Teile[Teil2][1] = CNC_up_3_count
                    Teile[Teil2][2] = words_for_CNCs
                    Teile2[Teil2][0] = word_count
                    Teile2[Teil2][1] = noun_count
                    noun_count = 0
                    word_count = 0
                    CNC_up_3_count = 0
                    words_for_CNCs = 0
                    Current_teil = Teil3
                elif re.search(Ende, word[0]) != None:
                    print(word[0], " THIS IS WORD 0 BY ENDE")
                    print("Teil 3", word_count, CNC_up_3_count, words_for_CNCs)
                    write_log(parts_directory, "End of Conclusion")
                    Teile[Teil3][0] = word_count
                    Teile[Teil3][1] = CNC_up_3_count
                    Teile[Teil3][2] = words_for_CNCs
                    Teile2[Teil3][0] = word_count
                    Teile2[Teil3][1] = noun_count
                    word_count = 0
                    noun_count = 0
                    df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Intro>",
                                      'WordsN': Teile2[Teil1][0], 'NounsN': Teile2[Teil1][1],
                                      'CNCsNounsN': Teile[Teil1][2]}, ignore_index=True)
                    df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Middle>",
                                      'WordsN': Teile2[Teil2][0], 'NounsN': Teile2[Teil2][1],
                                      'CNCsNounsN': Teile[Teil2][2]}, ignore_index=True)
                    df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Conclusion>",
                                      'WordsN': Teile2[Teil3][0], 'NounsN': Teile2[Teil3][1],
                                      'CNCsNounsN': Teile[Teil3][2]}, ignore_index=True)
                    print("new line in df2:", df2)
                    CNC_up_3_count = 0
                    words_for_CNCs = 0
        sum_CNCs = 0
        sum_total = 0
        word_count = 0
        noun_count = 0
        for teil in Teile.keys():
            # print("teil in Keys()", teil)
            # write_log(parts_directory, "words in the " + teil + str(Teile[teil][0]))
            # write_log(parts_directory, "CNCs up 3 in the " + teil + str(Teile[teil][1]))
            # write_log(parts_directory, "words that contains CNCs(N>=3) in " + teil + str(Teile[teil][2]))
            sum_CNCs += Teile[teil][2]
            sum_total += Teile[teil][0]
        full_ratio = sum_CNCs / sum_total
        for teil in Teile.keys():
            try:
                Teile[teil][3] = Teile[teil][2] / Teile[teil][0]
            except ZeroDivisionError:
                Teile[teil][3] = 0
            write_log(parts_directory, "words that contains CNCs(N>=3) in " + teil + " are " + str(Teile[teil][3]) + " % of this teil")
        write_log(parts_directory, "In this document CNCs >=3 are " + str(full_ratio) + "% of the text.")
    return df, df2