def parse_epub(filename: str, abbr: bool, code: bool):
    """ Parse an epub file """
    book = epub.read_epub(filename)
    title = book.get_metadata('DC', 'title')[0][0]
    remove_hashtags = title in TITLES_REMOVE_HASHTAGS  # indicate to remove hashtags
    print('\nParsing book "{0}"'.format(title))

    list_plaintexts = []
    counter_abbrs = Counter()
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        name = item.get_name()
        if not re.match(REGEX_CHAPITRE, name):
            print('...Ignoring {0}'.format(name))
            continue
        print('...Parsing {0}'.format(name))

        # parse and clean chapter
        plaintext, abbrs = clean_epub_item(item, abbr, code, remove_hashtags)
        list_plaintexts.append(plaintext)
        counter_abbrs += Counter(abbrs)

    book_plaintext = '\n\n\n'.join(list_plaintexts)
    # replace numbers
    book_plaintext = filter_numbers(book_plaintext)
    # normalize
    book_plaintext = maybe_normalize(book_plaintext)

    if abbr:
        print('Abbreviation counts:\n{0}'.format(counter_abbrs.items()))

    return book_plaintext
def fetch_play_text(url):
    text = []
    if url and len(url) > 0:
        if 'libretheatre.fr' in url:
            text = fetch_play_text_libretheatre(url)
        elif 'wikisource.org' in url:
            text = fetch_play_text_wikisource(url)

    finaltext = []
    for line in text:
        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line)
        line = line.strip()

        maybe_matches = re.finditer(PUNCT_NBSP, line)
        for maybe_match in maybe_matches:
            line = line.replace(
                maybe_match.group(0),
                "%s\u00a0%s" % (maybe_match.group(1), maybe_match.group(2)))

        finaltext += [line]

    return finaltext
def fetch_play_text(url):
    text = []
    if url and len(url) > 0:
        if 'libretheatre.fr' in url:
            text = fetch_play_text_libretheatre(url)
        elif 'wikisource.org' in url:
            text = fetch_play_text_wikisource(url)

    finaltext = []
    for line in text:
        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line)
        line = line.strip()
        line = line.replace("\n", " ")

        finaltext += [line]

    return finaltext
def format_address(address, template):
    # NB: zipcode is sometimes pronounced in 3 parts
    #     ex: 75001 => soixante quinze zero zero un
    #     and sometimes pronounced in 2 parts
    #     ex: 01090 => zero un quatre vingt dix
    #     see unit tests for more info
    zipcode = address['zipcode']
    zipcode_alt = '{}{}, {}{}{}'.format(*zipcode)
    address.update(
        zipcode=zipcode_alt if zipcode.startswith('0') else zipcode,
        zipcode_alt=zipcode_alt,
    )

    str = template.format(
        street_lower='{}{}'.format(address['street'][0].lower(),
                                   address['street'][1:]),
        **address)
    str = maybe_normalize(str, mapping=normalizers)
    str = filter_numbers(str)
    return str.strip()
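# Hedged usage sketch for format_address(). The address fields and the template
# string below are illustrative assumptions, not values taken from the project's
# data; they only show how the zipcode splitting behaves.
def _format_address_example():
    example_address = {
        'street': 'Avenue de la République',
        'zipcode': '01090',
        'city': 'Montmerle-sur-Saône',
    }
    # '01090' starts with '0', so the split form '01, 090' replaces 'zipcode'
    # before filter_numbers() spells it out; '75001' would be kept whole, with
    # '75, 001' still available to the template as 'zipcode_alt'.
    example_template = '{street_lower}, {zipcode}, {city}'
    return format_address(example_address, example_template)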
if args.one:
    break

doc.expandNode(node)
date_seance = node.firstChild.nodeValue
if len(date_seance) != 17:
    print("Bogus DateSeance?", date_seance)
    continue
seance_context = {'DateSeance': date_seance}

if event == END_ELEMENT:
    indent_level -= 2
    if type(node) == Element and len(visited) > 0:
        old = visited.pop()
        del old

if node.nodeName == 'texte' and seance_context is not None and 'DateSeance' in seance_context:
    doc.expandNode(node)
    if (visited[-2].attributes and 'code_style' in visited[-2].attributes
            and visited[-2].attributes['code_style'].value == 'NORMAL'):
        fullText = filter_numbers(recursive_text(node))
        fullText = re.compile(r'\s+').sub(' ', fullText)
        try:
            seance_context[node.nodeName].append(fullText)
        except KeyError:
            seance_context[node.nodeName] = [fullText]
def get_added_content(url, revid, lang):
    """
    Retrieves all content created by the contributor, except minor edits and
    derivative works like translations, content mixed with other contributors,
    or reverts.
    The "url" parameter specifies the API's base url.
    The "revid" parameter specifies the ID of the revision to check and retrieve.
    The "lang" parameter specifies the code of the processed language (e.g. "en", "fr", etc.)
    """
    # We want to compare the revision to the previous one, to see the content
    # the contributor added (or not)
    compare_query = {
        "action": "compare",
        "fromrev": revid,
        "torelative": "prev",
        "prop": "rel|diffsize|size|diff|title",
        "format": "json"
    }
    # print(compare_query)
    response = requests.post(url, params=compare_query).json()
    if "compare" not in response.keys():
        return None
    revid_size = response["compare"]["tosize"]

    if "prev" in response["compare"].keys():
        # If there are previous revisions, we need to check whether the current
        # revision isn't a derivative work (i.e. a revert)

        # Check if it's a revert
        rvcontinue = None
        revid_size = 0
        current_size = response["compare"]["tosize"]
        previous_size = response["compare"]["fromsize"]
        if previous_size > current_size:
            return None
        while True:
            # Let's compare the current and previous revisions of the page
            pr_query = {
                "action": "query",
                "prop": "revisions",
                "rvprop": "ids|tags|size",
                "format": "json",
                # "revids": previous_revision_id
                "rvendid": revid,
                "titles": response["compare"]["totitle"]
            }  # for retrieving a list of previous revisions until the current one
            if rvcontinue is not None:
                pr_query["rvcontinue"] = rvcontinue
            pr_response = requests.post(url, pr_query).json()
            for page in pr_response["query"]["pages"]:
                # Check if the current revision is a revert.
                for revision in pr_response["query"]["pages"][page]["revisions"]:
                    # print(revision.keys())
                    if revision["revid"] == revid:
                        revid_tags = revision["tags"]
                        if "mw-rollback" in revid_tags:
                            # Here, we're sure it's a revert
                            return None
                        continue
                    # If this previous revision has the same size as the current
                    # revision, maybe the revision we want to retrieve is a revert.
                    # Let's be conservative and consider it is.
                    if revision["size"] == revid_size:
                        return None
            if "continue" in pr_response.keys():
                rvcontinue = pr_response["continue"]["rvcontinue"]
            else:
                break

    # Now, let's retrieve the revision content!
    raw_html = response["compare"]["*"]
    document = html.document_fromstring(raw_html)
    added_lines = document.xpath("//td[@class='diff-addedline']")
    # deleted_lines = document.xpath("//td[@class='diff-deletedline']")
    text_list = []
    for td in added_lines:
        for div in td:
            if len(div.getchildren()) > 0:
                # if there are children tags, this is an inline modification,
                # not an addition -> skip it
                continue
            else:
                text = div.text_content()
                if "#REDIRECT" in text:
                    return None
                try:
                    # TODO: convert scales (1/25000, etc.)
                    text = pypandoc.convert_text(text, to="plain",
                                                 format="html").replace("\r\n", " ")
                    # to avoid removing relevant content in the {{lien}} template (French wikipedia)
                    text = re.sub(r"{{lien\|([^}]+)}}", r"\1", text)
                    text = pypandoc.convert_text(text, to="html",
                                                 format="mediawiki").replace("\r\n", " ")
                    # and we retrieve the real plain text
                    # TODO: add cleaning up of (), [], etc.
                    text = html.document_fromstring(text)
                    text = text.text_content()
                    # replacing by a space rather than by nothing, to ease the further string cleanup
                    text = text.replace("\xa0", " ")
                    text = re.sub(r' \([^)]+\)', '', text)
                    text = re.sub(r'\([^)]+\)', '', text)
                    text = maybe_normalize(text)
                    text = maybe_normalize(text, mapping=mapping_specific)
                    # In French, there's a space separation between thousand units.
                    # It isn't taken into account by num2words, so let's just remove those spaces.
                    text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
                    # TODO: need to internationalize this part below
                    # converting latlon coordinates
                    # text = re.sub(r'([0-9]+) ?°([0-9]+) ?\'([0-9]+) ?\"', r"\1 degrés \2 minutes \3 secondes", text)
                    # text = re.sub(r'-(\d*\.\d+|\d+)', "moins \1", text)
                    # for measure in measure_units:
                    #     text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?{measure}'.format(measure=measure), r"\1 {full_name}".format(full_name=measure_units[measure]), text)
                    #     text = re.sub(r'(\d*\[,.]\d+|\d+) ?{measure}'.format(measure=measure), r"\1 {full_name}s".format(full_name=measure_units[measure]), text)
                    # text = text.replace(" ?%", r" pour cent")

                    # remove references between brackets
                    text = re.sub(r'\[[0-9]+\]', '', text)  # r'\[[0-9]+*\]'

                    detected_lang = langid.classify(text)[0]
                    if detected_lang != lang:
                        continue

                    # Transforming numbers into letters
                    try:
                        text = filter_numbers(text, lang=lang)
                    except:
                        pass
                    text = text.strip()
                    if is_garbage(text, lang) == True:
                        # print("garbage:", text)
                        continue
                    # text = correct_sentence(text, lang)  # TODO: uncomment
                except:
                    # if pandoc cannot convert wikicode, there's a problem,
                    # and we don't want to retrieve malformed text
                    continue
                if len(text.split()) > 3:  # Let's not retrieve too short text
                    text_list.append(text)
    return " ".join(text_list)
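# Hedged usage sketch for get_added_content(): the endpoint follows the standard
# MediaWiki "api.php" layout (the same pattern used in get_article_texts below);
# the revision id is a made-up placeholder, not a real edit.
def _get_added_content_example(lang="fr", revid=123456789):
    api_url = "https://{lang}.wikipedia.org/w/api.php".format(lang=lang)
    # Returns the plain text added by that revision, or None when the revision
    # is a revert, a redirect, or otherwise a derivative work.
    return get_added_content(api_url, revid, lang)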
def parse_one_book(bookid):
    this_line = 0
    has_title = False

    mainpage_marker = ' '
    has_mainpage = False
    has_start_mainpage = False
    has_end_mainpage = False

    ebook = load_etext(bookid, refresh_cache=True,
                       mirror=GUTENBERG_MIRROR).replace('\r\n', '\n')
    raw_text = remove_markup(strip_headers(ebook).strip()).split('\n')

    search_for_mainpage_marker = len(
        list(filter(lambda x: x.startswith(mainpage_marker), raw_text))) > 0
    # print('search_for_mainpage_marker', search_for_mainpage_marker)

    finaltext = []
    for line in raw_text:
        # print('LINE=="{}"'.format(line))
        this_line += 1

        if len(line) == 0:
            continue

        if not has_title:
            if (search_for_mainpage_marker and line.startswith(mainpage_marker)) or True:
                if line.isupper():
                    has_title = True
                    # print('FOUND TITLE @', this_line, "'{}'".format(line))
            continue

        if not has_mainpage:
            if not has_start_mainpage:
                if (search_for_mainpage_marker and line.startswith(mainpage_marker)) or True:
                    has_start_mainpage = True
                    # print('FOUND MAIN PAGE START @', this_line, "'{}'".format(line))
                continue
            else:
                if (search_for_mainpage_marker and line.startswith(mainpage_marker)) or True:
                    has_end_mainpage = True
                    # print('FOUND MAIN PAGE END @', this_line, "'{}'".format(line))
                else:
                    continue
            has_mainpage = has_start_mainpage and has_end_mainpage

        if line.startswith(' '):
            # print('FOUND SOME EXTRA @', this_line, "'{}'".format(line))
            continue

        if line.isupper():
            # print('FOUND ONE CHAPTER @', this_line, "'{}'".format(line))
            continue

        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line).lstrip()

        maybe_matches = re.finditer(PUNCT_NBSP, line)
        for maybe_match in maybe_matches:
            line = line.replace(
                maybe_match.group(0),
                "%s\u00a0%s" % (maybe_match.group(1), maybe_match.group(2)))

        finaltext += [line]

    return finaltext
def get_article_texts(lang, revid_list):
    """
    Retrieves the revisions specified in "revid_list".
    The "lang" parameter specifies the Wikipedia version, e.g. "fr".
    To be used only with the first revision of articles originally created by the contributor.
    """
    url = "https://{lang}.wikipedia.org/w/api.php".format(lang=lang)
    query = {"action": "parse", "format": "json"}
    text_list = []
    for revid in revid_list:
        time.sleep(1)
        query["oldid"] = revid
        try:
            response = requests.post(url, data=query)
        except:
            time.sleep(30)
            response = requests.post(url, data=query)
        response = response.json()
        if "parse" not in response.keys():
            # it's possible that the revision has since been deleted; in that case there's nothing to parse
            continue
        raw_html = response["parse"]["text"]["*"]
        document = html.document_fromstring(raw_html)
        all_p = document.xpath("//p")
        for p in all_p:
            text = p.text_content()
            # replacing by a space rather than by nothing, to ease the further string cleanup
            text = text.replace("\xa0", " ")
            text = re.sub(r' \([^)]+\)', '', text)
            text = re.sub(r'\([^)]+\)', '', text)
            text = maybe_normalize(text)
            text = maybe_normalize(text, mapping=mapping_specific)
            # In French, there's a space separation between thousand units.
            # It isn't taken into account by num2words, so let's just remove those spaces.
            text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
            # TODO: need to internationalize this part below
            # converting latlon coordinates
            # text = re.sub(r'([0-9]+) ?°([0-9]+) ?\'([0-9]+) ?\"', r"\1 degrés \2 minutes \3 secondes", text)
            ## text = re.sub(r'-(\d*\.\d+|\d+)', "moins \1", text)
            # for measure in measure_units:
            #     text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?{measure}'.format(measure=measure), r"\1 {full_name}".format(full_name=measure_units[measure]), text)
            #     text = re.sub(r'(\d*\[,.]\d+|\d+) ?{measure}'.format(measure=measure), r"\1 {full_name}s".format(full_name=measure_units[measure]), text)
            #
            # text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?°', r"\1 degré", text)
            # text = re.sub(r'(\d*\[,.]\d+|\d+) ?°', r"\1 degrés", text)
            # text = re.sub(r'(\d*\[,.]\d+|\d+) ?mm', r"\1 millimètres", text)
            # text = re.sub(r'(\d*\[,.]\d+|\d+) ?cm', r"\1 centimètres", text)
            # text = re.sub(r'(\d*\[,.]\d+|\d+) ?m[^a-z]', r"\1 mètres ", text)
            # text = re.sub(r'(\d*\[,.]\d+|\d+) ?km', r"\1 kilomètres", text)
            # text = text.replace(" ?%", r" pour cent")

            # remove references between brackets
            text = re.sub(r'\[[0-9]+\]', '', text)  # r'\[[0-9]+*\]'

            # Transforming numbers into letters
            text = filter_numbers(text, lang=lang)
            text = text.strip()
            # text = " ".join([p.text_content().replace("\xa0", " ") for p in all_p])
            if "\n" in text or is_garbage(text, lang) == True:
                text = ""
            if langid.classify(text)[0] != lang:
                text = ""
            # text = correct_sentence(text, lang)  # TODO: uncomment
            # text = text.replace("%", "pour cent")
            if len(text.split()) > 3:
                # TODO: check content spelling
                # try:
                #     matches = tool.check(text)
                #     text = language_check.correct(text, matches)
                # except Exception as e:
                #     print(text)
                #     print("erreur correction : ", str(e))
                #     print(revid)
                #     print("*"*20)
                text_list.append(text)
    return text_list
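# Hedged usage sketch for get_article_texts(): the revision ids below are
# placeholders; per the docstring, real ids would be the first revisions of
# articles originally created by the contributor.
def _get_article_texts_example():
    example_revids = [123456789, 987654321]  # hypothetical revision ids
    return get_article_texts("fr", example_revids)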