def prepare_testing_datasets_wiki(self, file_list_wiki, rd_folder_path):
    """Build a testing DataFrame of (actual, error) value pairs from wiki revision dumps.

    Each ``*.json`` file in ``file_list_wiki`` holds a list of revision items whose
    first element carries ``old_value`` (the erroneous cell) and ``new_value``
    (the corrected cell).  Both values are stripped of wiki markup and
    non-alphanumeric noise; only sufficiently long, non-"none" pairs are kept.

    :param file_list_wiki: file names to scan (non-``.json`` entries are skipped)
    :param rd_folder_path: folder containing those files
    :return: DataFrame with columns ['actual', 'error']
    """

    def _clean(raw):
        # Strip wiki markup, collapse everything but [a-zA-Z0-9.-] to spaces, trim.
        value = remove_markup(str(raw).strip())
        value = re.sub('[^a-zA-Z0-9.-]+', ' ', value)
        return value.strip()

    def _usable(value):
        # Keep values that are non-empty, longer than 3 chars and not a textual "none".
        return bool(value) and len(value) > 3 and value not in ("none", "None")

    total_data = 0
    rows = []
    for rf in file_list_wiki:
        if not rf.endswith(".json"):
            continue
        try:
            revision_list = json.load(
                io.open(os.path.join(rd_folder_path, rf), encoding="utf-8"))
            for one_item in revision_list:
                old_value = _clean(one_item[0]['old_value'])
                new_value = _clean(one_item[0]['new_value'])
                if _usable(old_value) and _usable(new_value):
                    rows.append([new_value, old_value])
                    total_data += 1
        except Exception as e:
            # Best-effort: one malformed file must not abort the whole scan.
            print('Exception from wiki: ', str(e))
    # BUG FIX (perf): the original prepended every row via loc[-1] + index
    # shift + sort_index, which is O(n^2).  Collecting the rows and reversing
    # reproduces the same newest-first ordering in O(n).
    rows.reverse()
    actual_error = pd.DataFrame(rows, columns=['actual', 'error'])
    print("total_data: ", total_data)
    return actual_error
def prepare_domain_testing_datasets_wiki(self, file_list_wiki, rd_folder_path, domain_type):
    """Build a testing DataFrame of (actual, error) pairs restricted to one column domain.

    Only revision items whose ``errored_column`` belongs to the selected
    domain's column-name whitelist are kept; values are cleaned the same way
    as in prepare_testing_datasets_wiki.

    :param file_list_wiki: file names to scan (non-``.json`` entries are skipped)
    :param rd_folder_path: folder containing those files
    :param domain_type: currently only "location" selects a column whitelist
    :return: DataFrame with columns ['actual', 'error']
    """

    def _clean(raw):
        # Strip wiki markup, collapse everything but [a-zA-Z0-9.-] to spaces, trim.
        value = remove_markup(str(raw).strip())
        value = re.sub('[^a-zA-Z0-9.-]+', ' ', value)
        return value.strip()

    def _usable(value):
        # Non-empty, longer than 3 chars, and not a textual "none".
        return bool(value) and len(value) > 3 and value not in ("none", "None")

    total_data = 0
    # BUG FIX: domain_location was only bound when domain_type == "location";
    # any other domain_type raised NameError on every file (silently swallowed
    # by the broad except below).  Default to an empty whitelist instead.
    domain_location = []
    if domain_type == "location":
        domain_location = [
            'Country', 'COUNTRY', 'country', 'CITY', 'City', 'city',
            'Location', 'LOCATION', 'location', 'Place', 'PLACE', 'place',
            'VENUE', 'venue', 'Venue', 'Town', 'town', 'TOWN',
            'birth_place', 'death_place'
        ]
    rows = []
    for rf in file_list_wiki:
        if not rf.endswith(".json"):
            continue
        try:
            revision_list = json.load(
                io.open(os.path.join(rd_folder_path, rf), encoding="utf-8"))
            for one_item in revision_list:
                if domain_location and one_item[0]['errored_column'] in domain_location:
                    old_value = _clean(one_item[0]['old_value'])
                    new_value = _clean(one_item[0]['new_value'])
                    if _usable(old_value) and _usable(new_value):
                        rows.append([new_value, old_value])
                        total_data += 1
        except Exception as e:
            print('Exception from wiki: ', str(e))
    # Preserve the original prepend ordering (newest first) without the
    # O(n^2) loc[-1]/sort_index dance.
    rows.reverse()
    actual_error = pd.DataFrame(rows, columns=['actual', 'error'])
    print("Total data to repair: ", total_data)
    return actual_error
def prepare_datasets_retrain_wiki(self, file_list_wiki, rd_folder_path):  ###only for edit distance
    """Gather cleaned table-cell strings for edit-distance retraining.

    Reads every ``.json`` revision file, takes its last revision item, and
    flattens all non-header rows of its dirty table into one list of cleaned
    cell strings.

    :param file_list_wiki: file names to scan (non-``.json`` entries are skipped)
    :param rd_folder_path: folder containing those files
    :return: flat list of cleaned cell strings
    """
    train_data_rows = []
    for file_name in file_list_wiki:
        if not file_name.endswith(".json"):
            continue
        try:
            revision_list = json.load(
                io.open(os.path.join(rd_folder_path, file_name), encoding="utf-8"))
            latest_item = revision_list[-1]
            dirty_table = latest_item[0]['dirty_table']
            # Skip the header row, then clean each remaining row.
            for raw_row in dirty_table[1:]:
                cells = ast.literal_eval(remove_markup(str(raw_row)))
                cells = [c for c in cells if c]
                # Drop any cell containing a digit.
                cells = [c for c in cells if not any(ch.isdigit() for ch in c)]
                if cells:
                    train_data_rows.extend(
                        re.sub('[^a-zA-Z0-9.-]+', ' ', c) for c in cells)
        except Exception as exc:
            print('Exception: ', str(exc))
    return train_data_rows
def infobox_parsing(self):
    """Extract all Infobox templates of the current revision into per-infobox JSON files.

    For every template whose name contains "Infobox", a folder named after the
    cleaned template name is created under the page's revision folder, and the
    raw template text is dumped there as ``<parent_id>_<current_id>.json``.

    :return: number of infobox templates written
    """
    infobox_count = 0
    templates = self.code.filter_templates()
    for temp in templates:
        if "Infobox" not in temp.name:
            continue
        self.revision_page_folder_path = os.path.join(
            self.rdd_folder_path, self.page_folder)
        # makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair.
        os.makedirs(self.revision_page_folder_path, exist_ok=True)
        infobox_folder = remove_markup(str(temp.name))
        infobox_folder = re.sub('[^a-zA-Z0-9\n\.]', ' ',
                                str(infobox_folder).lower())
        revision_infobox_folder_path = os.path.join(
            self.revision_page_folder_path, infobox_folder)
        os.makedirs(revision_infobox_folder_path, exist_ok=True)
        out_name = (self.revision_id_parent + '_' +
                    self.revision_id_current + ".json")
        # BUG FIX: the original passed a bare open() to json.dump and never
        # closed the handle; a context manager guarantees the write is flushed.
        with open(os.path.join(revision_infobox_folder_path, out_name), "w") as fh:
            json.dump([str(temp)], fh)
        print(temp.name)
        infobox_count += 1
    return infobox_count
def identity(self) -> typing.Optional[str]:  # type: ignore[return]
    """Return the plain-text "Identity" section, or None when absent."""
    parsed = parse(self.main_content)
    # Probe every heading level (1..6) for a leading "Identity" section.
    for heading_level in range(1, 7):
        sections = parsed.get_sections(include_subsections=False,
                                       level=heading_level)
        if not sections:
            continue
        if sections[0].title.lower() == "identity":
            return remove_markup(sections[0].contents)
def background(self) -> typing.Optional[str]:  # type: ignore[return]
    """Return the cleaned plain text of the "Background" section, or None when absent."""
    parsed = parse(self.main_content)
    # Probe every heading level (1..6) for a leading "Background" section.
    for heading_level in range(1, 7):
        sections = parsed.get_sections(include_subsections=False,
                                       level=heading_level)
        if not sections:
            continue
        if sections[0].title.lower() == "background":
            # Hand the raw contents to the helper for extra cleanup first.
            self.helper.content = sections[0].contents
            return remove_markup(self.helper.clean_bg())
def table_parsing(self):
    """Extract every captioned table of the current revision into its own JSON file.

    For each table with a caption, a folder named after the cleaned caption is
    created under the page's revision folder and the raw table wikitext is
    dumped there as ``<parent_id>_<current_id>.json``.

    :return: number of tables successfully written
    """
    table_count = 0
    if not self.table:
        return table_count
    for tebil in self.table:
        # Derive a folder name from the table caption; skip unparsable tables.
        try:
            table_caption = wtp.parse(str(tebil)).tables[0].caption
            table_folder_name = remove_markup(str(table_caption)).lower().strip()
        except Exception as e:
            print('Exception: table folder name ', str(e))
            continue
        if not table_caption:
            continue
        try:
            self.revision_page_folder_path = os.path.join(
                self.rdd_folder_path, self.page_folder)
            if not os.path.exists(self.revision_page_folder_path):
                os.mkdir(self.revision_page_folder_path)
            table_folder_name = table_folder_name.strip('\n')
            revision_table_folder_path = os.path.join(
                self.revision_page_folder_path, table_folder_name).strip()
            if not os.path.exists(revision_table_folder_path):
                os.mkdir(revision_table_folder_path)
        except Exception as e:
            print('Exception: revision table folder', str(e))
            continue
        out_name = (self.revision_id_parent + '_' +
                    self.revision_id_current + ".json")
        # BUG FIX: the original never closed the dump file; use a context manager.
        with open(os.path.join(revision_table_folder_path, out_name), "w") as fh:
            json.dump([str(tebil)], fh)
        print('Table caption: ', table_folder_name)
        # BUG FIX: table_count was incremented twice per table (once before and
        # once after the dump), inflating the reported count; count each once.
        table_count += 1
    return table_count
def search(text):
    """Reduce Estonian-Wikipedia wikitext to plain body text.

    Strips trailing navigation sections, tags, templates, comments, external
    links, headings and image wikilinks, then removes the remaining markup.
    """
    # Cut the article at the first of the standard trailing sections:
    # "Vaata ka" (see also), "Kirjandus" (literature), "Viited" (references),
    # "Välislingid" (external links).
    regex = re.compile(r'==\s*Vaata ka\s*')
    text = regex.split(text)[0]
    regex = re.compile(r'==\s*Kirjandus\s*==')
    text = regex.split(text)[0]
    regex = re.compile(r'==\s*Viited\s*==')
    text = regex.split(text)[0]
    regex = re.compile(r'==\s*Välislingid\s*==')
    text = regex.split(text)[0]
    code = mw.parse(trim_unnessessary_spaces(text))
    # Drop container/markup tags wholesale; unwrap every other tag to its contents.
    # NOTE(review): "and" binds tighter than "or", so the brace test pairs only
    # with itself — presumably intended as (tag[0]=="{" and tag[-1]=="}") or ...;
    # confirm before changing.
    for tag in code.filter_tags(recursive=False):
        if tag[0] == "{" and tag[-1] == "}" or tag.tag == 'gallery' or tag.tag == 'imagemap' or tag.tag == 'center' or tag.tag == 'ref':  #or tag.tag == 'table':
            code.replace(tag, "")
        else:
            code.replace(tag, tag.contents)
    for argument in code.filter_arguments():
        code.replace(argument, "")
    for comment in code.filter_comments():
        code.replace(comment, "")
    for external_link in code.filter_external_links():
        code.replace(external_link, "")
    for heading in code.filter_headings():
        code.replace(heading, "")
    # Decode HTML entities (&amp; etc.) in place.
    for html_entity in code.filter_html_entities():
        code.replace(html_entity, html_entity.normalize())
    for template in code.filter_templates(recursive=False):
        code.replace(template, "")
    # Remove image/file links entirely (their titles are not body text);
    # "Fail"/"Pilt" are the Estonian file/image namespace prefixes.
    for wikilink in code.filter_wikilinks(recursive=False):
        if bool(re.match("(File|Fail|Pilt|Image):.+\.(SVG|svg|JPEG|jpeg|GIF|gif|PNG|png|JPG|jpg)", str(wikilink.title))):
            code.replace(wikilink, "")
    answer = remove_markup(str(code))
    # Join all lines into one string (no separator); the inner re.sub is a
    # no-op after split('\n') but is kept as-is.
    splitted = answer.split('\n')
    for i, item in enumerate(splitted):
        splitted[i] = re.sub("\n", "", splitted[i])
    answer = ''.join(splitted)
    return answer
def prepare_wiki_datasets_finetune(self, file_list_wiki, rd_folder_path, domain_type):
    """Write a fine-tuning corpus ("train_bert_wiki.txt") from domain-matching wiki tables.

    Rows of the dirty table of each file's latest revision item are kept only
    when the errored column belongs to the requested domain; cleaned rows are
    joined into sentences terminated by " ." and written to train_bert_wiki.txt.

    :param file_list_wiki: file names to scan (non-``.json`` entries are skipped)
    :param rd_folder_path: folder containing those files
    :param domain_type: currently only "location" selects a column whitelist
    """
    train_data_rows = []
    # BUG FIX: domain_location was unbound for any domain_type other than
    # "location", raising NameError per file (swallowed by the except below).
    domain_location = []
    if domain_type == "location":
        domain_location = [
            'Country', 'COUNTRY', 'country', 'CITY', 'City', 'city',
            'Location', 'LOCATION', 'location', 'Place', 'PLACE', 'place',
            'VENUE', 'venue', 'Venue', 'Town', 'town', 'TOWN',
            'birth_place', 'death_place'
        ]
    for rf in file_list_wiki:
        if not rf.endswith(".json"):
            continue
        try:
            revision_list = json.load(
                io.open(os.path.join(rd_folder_path, rf), encoding="utf-8"))
            one_item = revision_list[-1]
            if domain_location and one_item[0]['errored_column'] in domain_location:
                dirty_table = one_item[0]['dirty_table']
                # Skip the header row, then clean every remaining row.
                for raw_row in dirty_table[1:]:
                    cells = ast.literal_eval(remove_markup(str(raw_row)))
                    cells = list(filter(None, cells))
                    # Drop cells containing digits.
                    cells = [x for x in cells if not any(ch.isdigit() for ch in x)]
                    if cells:
                        cells = [re.sub('[^a-zA-Z0-9.-]+', ' ', c) for c in cells]
                        train_data_rows.append(cells)
        except Exception as e:
            print('Exception from wiki: ', str(e))
    # Join each row into a sentence ending in " ." — O(n) via join instead of
    # the original quadratic string concatenation.
    txt = "".join(' '.join(row) + " ." for row in train_data_rows)
    with open("train_bert_wiki.txt", "w") as output:
        output.write(txt)
def masking_error_value(self, filelist, rd_folder_path):
    """Replace the erroneous cell value in each revision item's vicinity with "[MASK]".

    Diagnostic routine: prints the vicinity before and after masking for every
    revision item; nothing is returned or stored.  (The unused locals
    ``error_value = []`` and ``vicinity_with_amsk = []`` of the original were
    removed.)

    :param filelist: file names to scan (non-``.json`` entries are skipped)
    :param rd_folder_path: folder containing those files
    """
    mask = "[MASK]"
    for rf in filelist:
        if not rf.endswith(".json"):
            continue
        try:
            revision_list = json.load(
                io.open(os.path.join(rd_folder_path, rf), encoding="utf-8"))
            for one_item in revision_list:
                vicinity = one_item[0]['vicinity']
                vicinity = remove_markup(str(vicinity))
                vicinity = ast.literal_eval(vicinity)
                vicinity = list(filter(None, vicinity))
                error_value = self.preprocess_text(one_item[0]['old_value'])
                vicinity = [self.preprocess_text(item) for item in vicinity]
                print('Before Masking: ', vicinity)
                print('Error value : ', error_value)
                error_value = error_value.strip()
                # Mask every cell that equals the (stripped) error value.
                vicinity = [
                    mask if str(x).strip() == str(error_value) else x
                    for x in vicinity
                ]
                print('After masking : ', vicinity)
        except Exception as e:
            print('Exception: ', str(e))
import untangle
import wikitextparser as wtp
from wikitextparser import remove_markup, parse

# Extract the plain-text "Plot"/"Synopsis" section of every page in the dump
# and print it (an empty line when a page has no such section).
wiki_file = "./Wikipedia.xml"
wiki_obj = untangle.parse(wiki_file)
stories = wiki_obj.mediawiki.page
for story in stories:
    story_text = wtp.parse(story.revision.text.cdata)
    plain_text = ""
    for section in story_text.sections:
        title = str(section.title).strip()
        if title == "Plot" or title == "Synopsis":
            try:
                plain_text = remove_markup(section.string)
            # BUG FIX: the bare "except:" also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception, keep best-effort skip.
            except Exception:
                pass
    print(plain_text)
def diff_check_revision(self):
    """Diff the current table revision against the previous one, cell by cell.

    Returns a list of dicts, one per detected cell change, each carrying the
    table's columns, the changed column's value domain, the row vicinity and
    the old/new cell values.  Tables that fail to parse, have duplicate or
    mismatched headers, or values >= 50 chars are skipped.
    """
    create_revision_list = []
    table_column_current = None
    table_column_previous = None
    code_current = mwparserfromhell.parse(self.current_revision_file[0],
                                          skip_style_tags=True)
    code_previous = mwparserfromhell.parse(self.previous_revision_file[0],
                                           skip_style_tags=True)
    try:
        # Current revision: first <table> tag -> parsed cell matrix; row 0 is
        # the header.
        table1 = code_current.filter_tags(
            matches=lambda node: node.tag == "table")
        table_code_current = wtp.parse(str(table1[0])).tables[0]
        table_data_current = table_code_current.data()
        table_column_current = table_data_current[0]
        # Previous revision: same extraction.
        table2 = code_previous.filter_tags(
            matches=lambda node: node.tag == "table")
        table_code_previous = wtp.parse(str(table2[0])).tables[0]
        table_data_previous = table_code_previous.data()
        table_column_previous = table_data_previous[0]
        # DataFrame view of the previous table; its first row becomes the
        # header so columns can be looked up by name below.
        df_data = DataFrame(table_data_previous)
        header = df_data.iloc[0]
        new_column_list = header.tolist()
        df_data = df_data[1:]
        df_data.columns = header
    except Exception as e:
        print('Exception from table data: ', str(e))
    # Proceed only when both tables parsed and the previous header has no
    # duplicate column names (required for unambiguous column lookup).
    if table_column_current and table_column_previous and len(
            table_column_previous) == len(set(table_column_previous)):
        self.table_count_with_error = self.table_count_with_error + 1
        if len(table_column_current) == len(table_column_previous):
            text1 = table_data_previous
            text2 = table_data_current
            if text1 and text2:
                # Walk rows of both revisions in lockstep, skipping the header.
                for index1, (txt1, txt2) in enumerate(zip(text1,
                                                          text2)):  #row parsing
                    if index1 == 0:
                        continue
                    d = difflib.Differ()
                    # Walk the cells of this row pair in lockstep.
                    for index, (cell1, cell2) in enumerate(
                            zip(txt1, txt2)):  # values of row parsing
                        create_revision_dict = {}
                        old_value = None
                        new_value = None
                        cell1 = remove_markup(str(cell1))
                        cell2 = remove_markup(str(cell2))
                        cell1 = cell1.strip()
                        cell2 = cell2.strip()
                        if cell1 and cell2:
                            # Single-element diff: '-' line carries the old
                            # cell, '+' line the new one.
                            diff1 = d.compare([''.join(cell1)], [cell2])
                            try:
                                if diff1:
                                    for line in diff1:
                                        if not line.startswith(' '):
                                            if line.startswith('-'):
                                                old_value = line[1:]
                                            if line.startswith('+'):
                                                new_value = line[1:]
                                            # Once both sides are seen, record
                                            # the change for this cell.
                                            if old_value and new_value:
                                                txt1 = remove_markup(str(txt1))
                                                old_value = remove_markup(
                                                    str(old_value))
                                                new_value = remove_markup(
                                                    str(new_value))
                                                column_name = new_column_list[
                                                    index]
                                                column_name = str(column_name)
                                                # All previous values of the
                                                # changed column (its domain).
                                                column_values = df_data[
                                                    column_name].tolist()
                                                column_values = remove_markup(
                                                    str(column_values))
                                                # Strip residual HTML tags
                                                # from the column names.
                                                cleanr = re.compile('<.*?>')
                                                all_column = list(df_data.columns)
                                                all_column = re.sub(
                                                    cleanr, ' ', str(all_column))
                                                all_column = remove_markup(
                                                    all_column)
                                                column_name = re.sub(
                                                    cleanr, ' ', str(column_name))
                                                column_name = remove_markup(
                                                    column_name)
                                                # Ignore very long values —
                                                # typically whole-cell rewrites,
                                                # not single-value errors.
                                                if len(old_value) < 50 and len(
                                                        new_value) < 50:
                                                    create_revision_dict = {
                                                        "columns": all_column,
                                                        "domain": column_values,
                                                        "vicinity": txt1,
                                                        "errored_column": column_name,
                                                        "old_value": old_value,
                                                        "new_value": new_value
                                                    }
                                                    create_revision_list.append(
                                                        create_revision_dict)
                                                    print('column: ',
                                                          column_name,
                                                          'old cell: ',
                                                          old_value,
                                                          'new_cell: ',
                                                          new_value)
                            except Exception as e:
                                print('Exception from revised value: ', str(e))
    return create_revision_list
def test_nested_bold_or_italic_plain_text():
    """Bold or italic markup nested inside a piped wikilink collapses to the label text."""
    italic_case = "''[[a|''b'']]"
    bold_case = "'''[[a|'''b''']]"
    assert remove_markup(italic_case) == 'b'
    assert remove_markup(bold_case) == 'b'
def test_remove_markup():
    """Italics, a template, an empty comment and bold markup all strip away cleanly."""
    marked_up = "''a'' {{b}} c <!----> '''d'''"
    assert remove_markup(marked_up) == "a c d"
def error_correction_fasttext_with_retrain_wiki(self, model_type, datasets_type, dataparam_1, dataparam_2):
    """Retrain a pre-trained FastText model on fresh wiki tables, then score repairs.

    :param model_type: which pre-trained model to load ("Fasttext_All_Domain",
        "Fasttext_CV_Fold" or "Fasttext_Domain_Location")
    :param datasets_type: "wiki" triggers incremental retraining on dataparam_1
    :param dataparam_1: list of revision json file names
    :param dataparam_2: folder containing those files
    """
    total_error = 0
    total_error_to_repaired = 0
    total_repaired = 0
    # NOTE(review): model_fasttext/error_correction stay unbound for model
    # types outside the three handled below ("Fasttext_CV_Fold" also never
    # binds error_correction) — callers must pass a valid combination.
    if model_type == "Fasttext_All_Domain":
        # Every call reloads the pretrained model to test the new wiki tables.
        error_correction = self.prepare_testing_datasets_wiki(
            dataparam_1, dataparam_2)
        model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v")
    if model_type == "Fasttext_CV_Fold":
        model_fasttext = FastText.load("model/Fasttext_CV_Fold.w2v")
    if model_type == "Fasttext_Domain_Location":
        model_fasttext = FastText.load("model/Fasttext_Location_Domain.w2v")
        error_correction = self.prepare_domain_testing_datasets_wiki(
            dataparam_1, dataparam_2, "location")
        total_error = self.calculate_total_error_wiki(dataparam_1, dataparam_2)
    if datasets_type == "wiki":
        # Build retraining sentences from the dirty tables of each file.
        train_data_rows = []
        for rf in dataparam_1:
            if not rf.endswith(".json"):
                continue
            try:
                revision_list = json.load(
                    io.open(os.path.join(dataparam_2, rf), encoding="utf-8"))
                one_item = revision_list[-1]
                old_value = str(one_item[0]['old_value'].strip())
                new_value = str(one_item[0]['new_value'].strip())
                # Swap the erroneous value for the corrected one in the
                # vicinity; .index() raises ValueError (caught below) when
                # old_value is absent, which deliberately skips the file.
                vicinity = one_item[0]['vicinity']
                vicinity = remove_markup(str(vicinity))
                vicinity = ast.literal_eval(vicinity)
                train_vicinity_index = vicinity.index(old_value)
                del vicinity[train_vicinity_index]
                vicinity.append(new_value)
                vicinity = [
                    x for x in vicinity
                    if not any(ch.isdigit() for ch in x)
                ]
                vicinity = [x for x in vicinity if len(x) != 0]  # drop empties
                dirty_table = one_item[0]['dirty_table']
                for index, row in enumerate(dirty_table):
                    if index == 0:
                        continue  # header row
                    row = remove_markup(str(row))
                    row = ast.literal_eval(row)
                    row = list(filter(None, row))
                    # Drop cells containing digits, then empties.
                    row = [x for x in row if not any(ch.isdigit() for ch in x)]
                    row = [x for x in row if len(x) != 0]
                    if row:
                        row = [
                            re.sub('[^a-zA-Z0-9.-]+', ' ', cell)
                            for cell in row
                        ]
                        train_data_rows.append(row)
            except Exception as e:
                print('Exception: ', str(e))
        if train_data_rows:
            # Incrementally extend the vocabulary and continue training.
            model_fasttext.build_vocab(train_data_rows, update=True)
            model_fasttext.train(sentences=train_data_rows,
                                 total_examples=len(train_data_rows),
                                 epochs=5)
    for error_value, actual_value in zip(error_correction['error'],
                                         error_correction['actual']):
        try:
            if model_type == "Fasttext_Domain_Location":
                pass  # total_error already set via calculate_total_error_wiki
            else:
                total_error = total_error + 1
            # Only attempt repairs on purely non-numeric error values.
            if not any(ch.isdigit() for ch in error_value):
                total_error_to_repaired = total_error_to_repaired + 1
                similar_value = model_fasttext.most_similar(error_value)
                first, _score = similar_value[0]
                first = first.strip()
                actual_value = actual_value.strip()
                if first == actual_value:
                    print('Error : ', error_value, ' Fixed: ', first,
                          ' Actual: ', actual_value)
                    total_repaired = total_repaired + 1
        # BUG FIX: the bare "except:" also swallowed KeyboardInterrupt and
        # SystemExit; narrow to Exception while keeping best-effort skipping.
        except Exception:
            continue
    print(total_error, total_error_to_repaired, total_repaired)
    model_type = model_type + ' retrain wiki '
    self.evaluate_model(model_type, total_error, total_error_to_repaired,
                        total_repaired)