def build_ngram_unigram_dictionaries(transformed_token_list, correct_token_list,
                                     incorrect_token_list,
                                     ngram_to_unigram_dictionary,
                                     unigram_to_ngram_dictionary):
    ngram_tuples = get_frequent_ngrams(transformed_token_list,
                                       p.ngram_occurence_freq)
    for ngram in ngram_tuples:
        # build the three candidate spellings of the ngram (the joins also fix
        # the stray leading '-' the old accumulate-and-strip loop produced)
        merged_ngram = ''.join(ngram)
        split_ngram_string = ' '.join(ngram)
        dash_separated_string = '-'.join(ngram)
        # check if the ngram forms a known unigram when merged
        if merged_ngram in correct_token_list:
            ngram_to_unigram_dictionary.append([ngram, merged_ngram])
        elif merged_ngram in incorrect_token_list:
            unigram_to_ngram_dictionary.append(
                [merged_ngram, split_ngram_string, dash_separated_string])
    # print to dictionary files
    print_to_file(v.ngram_to_unigram_dictionary_path,
                  ngram_to_unigram_dictionary, v.ngram_to_unigram_headings)
    print_to_file(v.unigram_to_ngram_dictionary_path,
                  unigram_to_ngram_dictionary, v.unigram_to_ngram_headings)

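# get_frequent_ngrams() and print_to_file() are used throughout this pipeline but
# are not defined in this section. The sketches below are assumptions inferred
# from the call sites, not the project's actual implementations: the real
# get_frequent_ngrams likely pads results into headword/tailword columns, and the
# real print_to_file may format the sheet differently.
from collections import Counter

import pandas as pd


def get_frequent_ngrams_sketch(text_list, min_freq, max_n=6):
    """Hypothetical stand-in: collect 2..max_n-grams that occur at least
    min_freq times across the corpus, as tuples of tokens."""
    counts = Counter()
    for sentence in text_list:
        tokens = [t for t in str(sentence).split(' ') if t]
        for n in range(2, max_n + 1):
            for i in range(len(tokens) - n + 1):
                counts[tuple(tokens[i:i + n])] += 1
    return [ngram for ngram, freq in counts.items() if freq >= min_freq]


def print_to_file_sketch(path, rows, headings):
    """Hypothetical stand-in: write one row per record under the headings."""
    pd.DataFrame(rows, columns=list(headings)).to_excel(path, index=False)
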
def semantic_transformation(data_path, sheet_name, columns, short_text_name):
    """Prints corpus with semantic transformation applied"""
    wo_data = pd.read_excel(data_path, sheet_name=sheet_name)
    # use the passed-in column selection (the old code hard-coded "ShortText"
    # and ignored the columns parameter)
    selected_wo_data = pd.DataFrame(wo_data, columns=columns)
    short_text_list = selected_wo_data[short_text_name]  # just get short text

    transformed_text_list = []
    for short_text in short_text_list:
        # Step 1: Tokenization
        # Generates a lower-cased token string with punctuation removed
        tokenized = tokenization(short_text)
        new_text = ' '.join(token.lower() for token in tokenized)

        # Step 2: Semantic Transformation
        # Rewrites the token string against regex matches
        transformed_text = semantic_transform(new_text)
        transformed_text_list.append(transformed_text)

    # Write output to file
    print_to_file(v.transformed_text_path_stage_1, transformed_text_list,
                  v.transformed_text_heading)

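# tokenization() and semantic_transform() are project helpers not shown in this
# section. A minimal sketch of assumed behaviour only: tokenization strips
# punctuation and splits on whitespace, and semantic_transform normalises
# regex-matchable patterns to placeholder tokens. The single rule and the
# '<num>' placeholder below are illustrative assumptions; the real regexes will
# differ.
import re


def tokenization_sketch(text):
    # drop punctuation, then split on whitespace
    return re.sub(r'[^\w\s]', ' ', str(text)).split()


def semantic_transform_sketch(text):
    # example rule only: collapse bare numbers to a placeholder token
    return re.sub(r'\b\d+\b', '<num>', text)
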
def lemmatisation():
    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_lemmatisation,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    # build the set of all tokens seen in the corpus
    token_list = []
    for sentence in transformed_token_list:
        token_list.extend(sentence.split(' '))
    token_set = set(token_list)

    final_sentences = []
    for sentence in transformed_token_list:
        tokens = sentence.split(' ')
        final_tokens = []
        for w in tokens:
            final_word = w
            lemmatized_word = Lem.lemmatize(w)
            if len(w) > 3 and lemmatized_word != w:
                # only lemmatise when the lemma also occurs in the same record,
                # or for a plausible plural whose singular occurs in the corpus
                if lemmatized_word in tokens:
                    final_word = lemmatized_word
                elif len(w) > 4 and w[-1] == 's' and w[:-1] in token_set:
                    final_word = lemmatized_word
            final_tokens.append(final_word)
        final_sentences.append(' '.join(final_tokens))

    print_to_file(v.transformed_text_path_stage_3, final_sentences,
                  v.transformed_text_heading)

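# Lem is not defined in this section. Its .lemmatize() call matches NLTK's
# WordNetLemmatizer API, so the module-level setup is assumed to be the
# following (an assumption, not confirmed by the source):
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)  # one-off corpus download
Lem = WordNetLemmatizer()  # e.g. Lem.lemmatize('pumps') -> 'pump'
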
def abbreviation_correction():
    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_abbreviation,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    known_abbreviation_list = [['lh', 'left-hand'], ['rh', 'right-hand'],
                               ['flh', 'front-left-hand'],
                               ['rlh', 'rear-left-hand'],
                               ['frh', 'front-right-hand'],
                               ['rrh', 'rear-right-hand'],
                               ['rr', 'rear-right'], ['rl', 'rear-left'],
                               ['fr', 'front-right'], ['fl', 'front-left'],
                               ['rhs', 'right-hand-side'],
                               ['lhs', 'left-hand-side'],
                               ['hr', 'hour'], ['wk', 'week']]
    # abbreviations that tokenise as three tokens, e.g. "u _ s" -> "unserviceable"
    known_trigram_abbreviation_list = [
        ['u', '_', 's', 'unserviceable'],
        ['c', '_', 'o', 'changeout'],
        ['d', '_', 's', 'drivers'],
        ['a', '_', 'c', 'air conditioning'],
        ['l', '_', 'h', 'left-hand'],
        ['r', '_', 'h', 'right-hand'],
    ]

    final_sentences = []
    for sentence in transformed_token_list:
        tokens = sentence.split(' ')
        final_tokens = []
        flag = 0
        i = 0
        for index, token in enumerate(tokens):
            final_token = token
            # check if in abbreviation list
            for abbrev, full in known_abbreviation_list:
                if abbrev == token:
                    final_token = full
            # check if this token is the middle of a trigram abbreviation
            for trigram in known_trigram_abbreviation_list:
                if (index != 0 and index != (len(tokens) - 1)
                        and tokens[index - 1] == trigram[0]
                        and tokens[index] == trigram[1]
                        and tokens[index + 1] == trigram[2]):
                    final_token = trigram[3]
                    flag = 1
                    i = index
            final_tokens.append(final_token)
        # drop the leading and trailing tokens of the expanded trigram; note
        # that only one trigram per sentence is handled this way
        if flag == 1:
            final_tokens.pop(i + 1)
            final_tokens.pop(i - 1)
        final_sentences.append(' '.join(final_tokens))

    print_to_file(v.transformed_text_path_stage_4, final_sentences,
                  v.transformed_text_heading)

def baseline_tagging(transformed_text_list):
    tagged_records = []
    # regular expression used for noun phrase chunking: one or more nouns
    # ("*" would also emit empty chunks, so "+" is used)
    grammar = "NP: {<NN.*>+}"
    cp = nltk.RegexpParser(grammar)
    for sentence in transformed_text_list:
        # split and drop the empty tokens left by repeated spaces
        s = [tok for tok in sentence.strip().split(" ") if tok]
        if '+' not in s:
            # tagged_s is a list of (word, pos_tag) tuples
            tagged_s = nltk.pos_tag(s)
            for c, (word, pos_tag) in enumerate(tagged_s):
                # only searching for the original verb
                if 'VB' in pos_tag:
                    s[c] = word + '='
                elif 'JJ' in pos_tag:
                    s[c] = word + '#'
            # noun phrase chunking for items detection
            result = cp.parse(tagged_s)
            for subtree in result.subtrees():
                if subtree.label() == 'NP':
                    t = subtree
                    noun_phrase_chunk = ' '.join(
                        word for word, pos in t.leaves())
                    tagged_noun_phrase_chunk = '~'.join(
                        word for word, pos in t.leaves())
                    starting_index_noun_phrase_chunk = position_of_ngram(
                        tuple(noun_phrase_chunk.split()), s)
                    # collapse the chunk into one '~'-joined token
                    s[starting_index_noun_phrase_chunk] = tagged_noun_phrase_chunk
                    for i in range(1, len(t.leaves())):
                        s[starting_index_noun_phrase_chunk + i] = ''
            s = [x for x in s if x]
        tagged_records.append(' '.join(s))
    print_to_file(v.baseline_output_path, tagged_records, v.output_headings)

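# position_of_ngram() is called above but not defined in this section. A minimal
# sketch, assuming it returns the index at which an ngram starts within a token
# list (and None if absent); the real helper may behave differently at the edges.
def position_of_ngram_sketch(ngram, tokens):
    n = len(ngram)
    for i in range(len(tokens) - n + 1):
        if tuple(tokens[i:i + n]) == tuple(ngram):
            return i
    return None
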
def spelling_correction():
    """Performs spelling correction on transformed token list"""
    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_spelling_correction,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    transformed_stage_1 = stage_1(transformed_token_list)
    transformed_stage_2 = stage_2(transformed_stage_1)
    transformed_stage_3 = stage_3(transformed_stage_2)

    print_to_file(v.transformed_text_path_stage_2, transformed_stage_3,
                  v.transformed_text_heading)

def stage_2(transformed_token_list):
    """Checks tokens against the unigram to ngram dictionary"""
    dict_data = pd.read_excel(v.stage_2_input_path,
                              sheet_name=v.input_file_sheet_name)
    selected_correct_token_data = pd.DataFrame(
        dict_data, columns=v.stage_2_input_file_columns)

    transformed_stage_2 = []
    for sentence in transformed_token_list:
        for row in selected_correct_token_data.itertuples():
            unigram = row.unigram.strip()
            # replace only when the unigram occurs as a whole token
            if unigram in sentence.split(' '):
                sentence = sentence.replace(unigram, row.ngram)
        transformed_stage_2.append(sentence)

    print_to_file(v.stage_2_output_path, transformed_stage_2,
                  v.input_file_columns)
    return transformed_stage_2

def build_correct_incorrect_token_dictionaries(transformed_token_list,
                                               correct_token_list,
                                               incorrect_token_list):
    for transformed_sentence in transformed_token_list:
        # drop empty strings left by repeated spaces (splitting on ' ' yields
        # '' rather than the ' ' the old check looked for)
        tokens = [t for t in transformed_sentence.strip().split(' ') if t]
        correct_token_list.extend(spell.known(tokens))
        incorrect_token_list.extend(spell.unknown(tokens))

    # print correct and incorrect token lists to file
    print_to_file(v.correct_token_dictionary_path,
                  list(set(correct_token_list)), v.correct_token_heading)
    print_to_file(v.incorrect_token_dictionary_path,
                  list(set(incorrect_token_list)), v.incorrect_token_heading)

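# spell is not defined in this section. Its known()/unknown() calls match the
# pyspellchecker API, which splits an iterable of words into recognised and
# unrecognised sets, so the setup is assumed to be:
from spellchecker import SpellChecker

spell = SpellChecker()
# spell.known(['pump', 'pmup'])   -> {'pump'}
# spell.unknown(['pump', 'pmup']) -> {'pmup'}
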
def stage_1():
    '''Filter out all ngrams that contain a symptom/state, maintenance
    activity, stopword, or misspelt token'''
    filtered_ngrams = []

    # get frequent ngrams
    ngram_data = pd.read_excel(v.all_tagged_frequent_ngrams_path,
                               sheet_name=v.input_file_sheet_name)
    frequent_ngram_data = pd.DataFrame(ngram_data, columns=v.ngrams_headings)

    # stopword list, extended with domain-specific position/time words
    stop_words = stopwords.words('english')
    stop_words = stop_words + [
        'right', 'left', 'front', 'rear', 'top', 'bottom', 'right-hand',
        'left-hand', 'hand', 'hourly', 'daily', 'weekly', 'monthly', 'yearly',
        'hour', 'day', 'week', 'month', 'year'
    ]

    # get incorrect token list
    dict_data = pd.read_excel(v.incorrect_token_dictionary_path,
                              sheet_name=v.input_file_sheet_name)
    incorrect_token_data = pd.DataFrame(dict_data,
                                        columns=v.incorrect_token_heading)
    incorrect_token_list = list(
        incorrect_token_data[v.incorrect_token_heading[0]])

    for index, row in frequent_ngram_data.iterrows():
        for col in ['headword', 'tailword1', 'tailword2', 'tailword3',
                    'tailword4', 'tailword5']:
            row[col] = get_proper_string(row[col])
        combined = (row['headword'] + ' ' + row['tailword1'] + ' ' +
                    row['tailword2'] + ' ' + row['tailword3'] + ' ' +
                    row['tailword4'] + ' ' + row['tailword5']).strip()
        # skip the ngram if it is already tagged as a symptom/state or a
        # maintenance activity
        if (v.symptom_state_tag_symbol not in combined) and (
                v.maintenance_activity_tag_symbol not in combined):
            combined_tokens = combined.split()
            # skip the ngram if it contains a common stopword or misspelt token
            found_flag = 0
            for token in combined_tokens:
                if token in stop_words or token in incorrect_token_list:
                    found_flag = 1
            if found_flag == 0:
                filtered_ngrams.append(combined_tokens)

    print_to_file(v.maint_item_filtering_stage_1_path, filtered_ngrams,
                  ['headword', 'tailword1', 'tailword2', 'tailword3',
                   'tailword4', 'tailword5'])

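# get_proper_string() (also used in dictionary_building and stage_2 below) is
# not defined in this section. Judging from its use on possibly-empty Excel
# cells, a minimal sketch would coerce NaN and non-string values to an empty
# string; this is an assumption, not the project's implementation.
import pandas as pd


def get_proper_string_sketch(value):
    if pd.isna(value):
        return ''
    return str(value).strip()
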
def main():
    print("Starting tagging: maintenance_item")
    preprocessed_data = pd.read_excel(v.maintenance_activity_output_path,
                                      sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])

    ngrams = get_frequent_ngrams(transformed_text_list, p.ngram_occurence_freq)
    print_to_file(v.all_tagged_frequent_ngrams_path, ngrams, v.ngrams_headings)

    stage_1()
    print("stage 1 complete")
    # only retrain embeddings when explicitly requested via the first argument
    if len(sys.argv) > 1 and sys.argv[1] == "1":
        generate_word_embeddings()  # long running operation
    stage_2()
    print("stage 2 complete")
    tagging(transformed_text_list)
    print("tagging: maintenance item tagging is complete")
    print('THE PROCESSING PIPELINE HAS COMPLETED SUCCESSFULLY')

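# generate_word_embeddings() (the long-running step gated above) is not shown in
# this section. A minimal sketch, assuming it trains a gensim Word2Vec model on
# the preprocessed corpus and saves it to v.word_2_vec_model_path for stage_2();
# the size= keyword matches the gensim 3.x API implied by the model.wv.vocab
# check in stage_2 (gensim 4+ renamed it to vector_size=).
import gensim


def generate_word_embeddings_sketch(text_list):
    sentences = [str(s).split(' ') for s in text_list if str(s).strip()]
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1)
    model.save(v.word_2_vec_model_path)
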
def keyword_based_ngram_filtering():
    filtered_ngrams = []
    headwords = ['cannot', 'not', 'is', 'are']
    ngram_data = pd.read_excel(v.all_frequent_ngrams_path,
                               sheet_name=v.input_file_sheet_name)
    df = pd.DataFrame(ngram_data, columns=v.ngrams_headings)
    for index, row in df.iterrows():
        # keep ngrams whose headword signals a symptom/state
        if row['headword'] != '' and row['headword'] in headwords:
            filtered_ngrams.append([
                row['headword'], row['tailword1'], row['tailword2'],
                row['tailword3'], row['tailword4'], row['tailword5']
            ])
        # also keep "to be ..." ngrams (tailword5 included so the row matches
        # the width of the headings)
        if row['headword'] == 'to' and row['tailword1'] == 'be':
            filtered_ngrams.append([
                row['headword'], row['tailword1'], row['tailword2'],
                row['tailword3'], row['tailword4'], row['tailword5']
            ])
    print_to_file(v.symptom_state_filtered_ngrams_path, filtered_ngrams,
                  v.ngrams_headings)

def stage_1(transformed_token_list):
    """Checks tokens against ngram to unigram dictionary"""
    dict_data = pd.read_excel(v.stage_1_input_path,
                              sheet_name=v.input_file_sheet_name)
    selected_correct_token_data = pd.DataFrame(
        dict_data, columns=v.stage_1_input_file_columns)

    transformed_stage_1 = []
    for sentence in transformed_token_list:
        for row in selected_correct_token_data.itertuples():
            # the ngram column holds a stringified tuple, e.g. "('pres', 'sure')"
            ngram = ' '.join(literal_eval(row.ngram))
            split_bigram = ngram.split(' ')
            split_sentence = sentence.strip().split(' ')
            # merge only when both parts of the (bigram) entry occur as whole
            # tokens in the sentence
            if (ngram in sentence and split_bigram[0] in split_sentence
                    and split_bigram[1] in split_sentence):
                sentence = sentence.replace(ngram, row.unigram)
        transformed_stage_1.append(sentence)

    print_to_file(v.stage_1_output_path, transformed_stage_1,
                  v.input_file_columns)
    return transformed_stage_1

def detect_activities(transformed_text_list, dictionary_list):
    tagged_records = []
    try:
        # warm-up call: pattern's first conjugate() raises on Python 3.7
        conjugate('hello', 'inf')
    except Exception:
        pass
    for sentence in transformed_text_list:
        if type(sentence) != float:  # read_excel yields NaN (a float) for empty cells
            tokens = sentence.split(' ')
            for idx, token in enumerate(tokens):
                # skip tokens already tagged as a symptom/state
                if v.symptom_state_tag_symbol not in token:
                    conjugated_current_word = conjugate(token, 'inf')
                    if conjugated_current_word in dictionary_list:
                        tokens[idx] = token + v.maintenance_activity_tag_symbol
            tagged_records.append(' '.join(tokens))
        else:
            tagged_records.append('')
    print_to_file(v.maintenance_activity_output_path, tagged_records,
                  v.output_headings)

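# conjugate() comes from pattern.en. On Python 3.7 its first call can raise
# (a StopIteration escaping a generator), which the warm-up try/except above
# absorbs; subsequent calls work normally. Typical usage:
#
#   from pattern.en import conjugate
#   conjugate('replaced', 'inf')  # -> 'replace'
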
def tagging(transformed_text_list):
    tagged_records = []
    dictionary_data = pd.read_excel(v.symptom_state_dictionary_path,
                                    sheet_name=v.input_file_sheet_name)
    dictionary_df = pd.DataFrame(dictionary_data, columns=v.dictionary_headings)
    dictionary_list = []
    for index, row in dictionary_df.iterrows():
        dictionary_list.append(row['words'].split(' '))

    for sentence in transformed_text_list:
        tokens = sentence.strip().split(' ')
        total_ngrams = []
        for n in range(2, 7):
            total_ngrams = total_ngrams + list(ngrams(tokens, n))
        tagged, flag = tag_record(tokens, total_ngrams, dictionary_list)

        # else if a single term matches then tag
        # case 1: single term from describing nouns
        # case 2: single term from dictionary
        if flag == 0:
            for index, token in enumerate(tokens):
                for row in dictionary_list:
                    if len(row) == 1 and token == row[0] and len(token) > 3:
                        tokens[index] = token + v.symptom_state_tag_symbol
        if flag == 1:
            # join consecutive tagged words without a separating space (the old
            # condition compared i - 1 to len(tagged) and re-read tagged[i],
            # which never looked at the next word)
            joined = ''.join(
                w if (w.endswith(v.symptom_state_tag_symbol)
                      and i + 1 < len(tagged)
                      and tagged[i + 1].endswith(v.symptom_state_tag_symbol))
                else w + ' ' for i, w in enumerate(tagged)).lstrip()
            tagged_records.append(joined)
        else:
            tagged_records.append(' '.join(tokens))

    print_to_file(v.symptom_state_output_path, tagged_records,
                  v.output_headings)

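# tag_record() is shared by this tagger and the maintenance-item tagger below
# but is not defined in this section. A minimal sketch of the assumed behaviour:
# append the tag symbol to every token of each dictionary ngram found in the
# record and report whether anything was tagged. The tag_symbol parameter is
# added here for illustration; the real function takes three arguments, so it
# presumably gets the symbol some other way (e.g. a module-level setting).
def tag_record_sketch(tokens, total_ngrams, dictionary_list, tag_symbol):
    tagged = list(tokens)
    flag = 0
    for ngram in total_ngrams:
        if list(ngram) in dictionary_list:
            start = position_of_ngram(tuple(ngram), tokens)
            if start is not None:
                for j in range(start, start + len(ngram)):
                    if not tagged[j].endswith(tag_symbol):
                        tagged[j] = tagged[j] + tag_symbol
                flag = 1
    return tagged, flag
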
def dictionary_building():
    symptom_state_dictionary = []

    # step 1: get filtered ngrams and append to dictionary
    n_data = pd.read_excel(v.symptom_state_filtered_ngrams_path,
                           sheet_name=v.input_file_sheet_name)
    df = pd.DataFrame(n_data, columns=v.ngrams_headings)
    for index, row in df.iterrows():
        for col in ['headword', 'tailword1', 'tailword2', 'tailword3',
                    'tailword4', 'tailword5']:
            row[col] = get_proper_string(row[col])
        symptom_state_dictionary.append(
            (row['headword'] + ' ' + row['tailword1'] + ' ' +
             row['tailword2'] + ' ' + row['tailword3'] + ' ' +
             row['tailword4'] + ' ' + row['tailword5']).strip())

        # step 2: append tailwords to dictionary if headword is "is" or "are"
        if row['headword'] == "is" or row['headword'] == "are":
            words = (row['tailword1'] + ' ' + row['tailword2'] + ' ' +
                     row['tailword3'] + ' ' + row['tailword4'] + ' ' +
                     row['tailword5']).strip()
            symptom_state_dictionary.append(words)

    # step 3: append additional describing nouns to dictionary
    symptom_state_dictionary.extend(
        ['problem', 'error', 'leak', 'fault', 'damage', 'failure'])

    print_to_file(v.symptom_state_dictionary_path, symptom_state_dictionary,
                  v.dictionary_headings)

def ngram_detection(text_list):
    ngrams = get_frequent_ngrams(text_list, p.ngram_occurence_freq)
    print_to_file(v.all_frequent_ngrams_path, ngrams, v.ngrams_headings)

def tagging(transformed_text_list):
    def tag_against_dictionary(text_list, dictionary_list):
        """Tag every record against one dictionary (steps 1 and 2 below are
        identical apart from the dictionary used)."""
        records = []
        for sentence in text_list:
            if type(sentence) != float:  # read_excel yields NaN for empty cells
                tokens = sentence.strip().split(' ')
                total_ngrams = []
                # reverse the range so that longer ngrams are tagged first
                for n in range(7, 1, -1):
                    total_ngrams = total_ngrams + list(ngrams(tokens, n))
                tagged, flag = tag_record(tokens, total_ngrams, dictionary_list)
                # else if a single term matches then tag
                if flag == 0:
                    for index, token in enumerate(tokens):
                        for row in dictionary_list:
                            if len(row) == 1 and token == row[0]:
                                tokens[index] = (
                                    token + v.maintenance_item_tag_symbol)
                if flag == 1:
                    # join consecutive tagged words without a separating space
                    joined = ''.join(
                        w if w.endswith(v.maintenance_item_tag_symbol)
                        else w + ' ' for w in tagged).lstrip()
                    records.append(joined)
                else:
                    records.append(' '.join(tokens))
            else:
                records.append('')
        return records

    # step 1: check against the fluid dictionary built by stage_2()
    dictionary_data = pd.read_excel(v.maint_item_filtering_stage_2_path,
                                    sheet_name=v.input_file_sheet_name)
    dictionary_df = pd.DataFrame(dictionary_data, columns=v.dictionary_headings)
    dictionary_list = [row['words'].split(' ')
                       for index, row in dictionary_df.iterrows()]
    tagged_records = tag_against_dictionary(transformed_text_list,
                                            dictionary_list)
    print_to_file(v.maintenance_item_tagging_1, tagged_records,
                  v.output_headings)

    # step 2: check against the static dictionary
    print('step 2 begin')
    preprocessed_data = pd.read_excel(v.maintenance_item_tagging_1,
                                      sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])
    dictionary_data_static = pd.read_excel(v.maint_item_static_dictionary_path,
                                           sheet_name=v.input_file_sheet_name)
    dictionary_df_static = pd.DataFrame(dictionary_data_static,
                                        columns=['words'])
    dictionary_list = [row['words'].split(' ')
                       for index, row in dictionary_df_static.iterrows()]
    tagged_records = tag_against_dictionary(transformed_text_list,
                                            dictionary_list)
    print_to_file(v.maintenance_item_tagging_2, tagged_records,
                  v.output_headings)

    # step 3: merge runs of adjacent tagged tokens into one '~'-joined item
    print('step 3 begin')
    tagged_records = []
    preprocessed_data = pd.read_excel(v.maintenance_item_tagging_2,
                                      sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])
    for sentence in transformed_text_list:
        if type(sentence) != float:
            tokens = sentence.strip().split(' ')
            i = 0
            while i <= len(tokens) - 1:
                if i != 0 and (v.maintenance_item_tag_symbol in tokens[i]) and (
                        v.maintenance_item_tag_symbol in tokens[i - 1]):
                    # merge backward into the previous tagged token
                    if tokens[i].endswith('~'):
                        tokens[i - 1] = tokens[i - 1] + tokens[i]
                    else:
                        tokens[i - 1] = tokens[i - 1] + '~' + tokens[i]
                    del tokens[i]
                    if i == len(tokens) - 1:
                        i = i + 1
                elif (i != len(tokens) - 1
                        and tokens[i].endswith(v.maintenance_item_tag_symbol)
                        and v.maintenance_item_tag_symbol in tokens[i + 1]):
                    # merge forward; current token already ends with the tag
                    tokens[i] = tokens[i] + tokens[i + 1]
                    del tokens[i + 1]
                    i = i + 1
                elif (i != len(tokens) - 1
                        and v.maintenance_item_tag_symbol in tokens[i + 1]
                        and v.maintenance_item_tag_symbol in tokens[i]):
                    # merge forward with an explicit '~' separator
                    tokens[i] = tokens[i] + '~' + tokens[i + 1]
                    del tokens[i + 1]
                    i = i + 1
                else:
                    i = i + 1
            # remove a dangling tag from the last merged ngram (the loop always
            # exits with i == len(tokens), so the old "if i != len(tokens)"
            # guard made this trim unreachable; check the last token directly)
            if tokens and tokens[-1].count('~') > 1 and tokens[-1].endswith('~'):
                tokens[-1] = tokens[-1][:-1]
            tagged_records.append(' '.join(tokens).lstrip())
        else:
            tagged_records.append('+')  # placeholder used for empty records here

    print_to_file(v.maintenance_item_output_path, tagged_records,
                  v.output_headings)

def stage_2():
    '''Filter outlier words with word2vec'''
    outlier_words_dict = defaultdict(int)
    outlier_words_pos_filtered_dict = defaultdict(int)
    single_word_freq_dict = defaultdict(int)

    # get maintenance item ngrams from stage 1
    ngram_data = pd.read_excel(v.maint_item_filtering_stage_1_path,
                               sheet_name=v.input_file_sheet_name)
    maintenance_items = pd.DataFrame(ngram_data, columns=v.ngrams_headings)

    # get preprocessed maintenance records
    preprocessed_data = pd.read_excel(v.transformed_text_path_stage_4,
                                      sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data,
                                 columns=v.input_file_columns)
    # singular column name (indexing with the columns list returned a DataFrame)
    transformed_text_list = list(selected_data[v.input_file_column])

    # load the trained word2vec model
    model = gensim.models.Word2Vec.load(v.word_2_vec_model_path)

    ngram_columns = ['headword', 'tailword1', 'tailword2', 'tailword3',
                     'tailword4', 'tailword5']
    for index, row in maintenance_items.iterrows():
        for col in ngram_columns:
            row[col] = get_proper_string(row[col])
        combined = ' '.join(row[col] for col in ngram_columns).strip()
        parts_list = combined.split(' ')
        number_of_parts = len(parts_list)
        for a in parts_list:
            single_word_freq_dict[a] += 1
            if a not in model.wv.vocab:
                number_of_parts -= 1
        if number_of_parts == 0:
            outlier_word = ''
            dist_mean = 0
        else:
            outlier_word, dist_mean = customized_doesnt_match(
                model.wv, parts_list)
        outlier_words_dict[outlier_word] += 1
        outlier_words_pos_filtered_dict[outlier_word] += 1

    # rank outlier words by how often they are flagged relative to how often
    # they appear at all
    outlier_words_pos_filtered_dic_ranked_by_ratio = {}
    outlier_words_ratio_2 = {}
    for item in outlier_words_dict:
        freq, ratio = 0, 0
        if item in single_word_freq_dict:
            freq = single_word_freq_dict[item]
            ratio_unfiltered = outlier_words_dict[item] / freq
            ratio = outlier_words_pos_filtered_dict[item] / freq
            outlier_words_pos_filtered_dic_ranked_by_ratio[item] = ratio
            outlier_words_ratio_2[item] = ratio_unfiltered

    ratio_threshold = 0.8
    stopwords_stage_2 = [
        a for a in outlier_words_pos_filtered_dic_ranked_by_ratio
        if outlier_words_pos_filtered_dic_ranked_by_ratio[a] > ratio_threshold
    ]
    print(stopwords_stage_2)

    # drop every ngram containing one of the derived stopwords
    final = []
    for index, row in maintenance_items.iterrows():
        for col in ngram_columns:
            row[col] = get_proper_string(row[col])
        combined = ' '.join(row[col] for col in ngram_columns).strip()
        combined_list = [row[col] for col in ngram_columns]
        found = 0
        for stopword in stopwords_stage_2:
            if stopword in combined_list:
                found = 1
        if found == 0:
            final.append(combined)

    print_to_file(v.maint_item_filtering_stage_2_path, final,
                  v.dictionary_headings)

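# customized_doesnt_match() is called above but not defined in this section. A
# minimal sketch, assuming it mirrors gensim's KeyedVectors.doesnt_match() while
# also returning the mean similarity used for the decision (gensim 3.x API,
# matching the model.wv.vocab check above); the real implementation may filter
# or normalise differently.
import numpy as np


def customized_doesnt_match_sketch(wv, words):
    # keep only in-vocabulary words and compare each to the group centroid
    words = [w for w in words if w in wv.vocab]
    vectors = np.vstack([wv.word_vec(w, use_norm=True) for w in words])
    mean = np.mean(vectors, axis=0)
    dists = np.dot(vectors, mean)  # similarity of each word to the centroid
    # the word least similar to the centroid is the outlier
    return words[int(np.argmin(dists))], float(np.mean(dists))
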