from deltas import segment_matcher
from deltas.tokenizers import wikitext_split


def process_operations(parent_revision_text, revision_text):
    # Tokenize both revisions and diff the token sequences with the segment matcher.
    parent_revision_text = parent_revision_text or ''
    revision_text = revision_text or ''
    a = wikitext_split.tokenize(parent_revision_text)
    b = wikitext_split.tokenize(revision_text)
    return [op for op in segment_matcher.diff(a, b)], a, b
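# A minimal usage sketch for process_operations; the sample strings below are
# hypothetical. Each deltas operation exposes .name ('equal', 'insert', 'delete')
# and the token index ranges it covers in the parent (a1:a2) and current (b1:b2)
# token sequences.
ops, a, b = process_operations("Hello world.", "Hello brave new world.")
for op in ops:
    if op.name == "insert":
        print("inserted:", "".join(str(t) for t in b[op.b1:op.b2]))
    elif op.name == "delete":
        print("removed:", "".join(str(t) for t in a[op.a1:op.a2]))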
def tokenize_common():
    # Report the average tokenization time over 50 runs for each tokenizer.
    start = time.time()
    for _ in range(50):
        tokens = list(text_split.tokenize(common1))
    print("\ttext_split: {0}".format((time.time() - start) / 50))

    start = time.time()
    for _ in range(50):
        tokens = list(wikitext_split.tokenize(common1))
    print("\twikitext_split: {0}".format((time.time() - start) / 50))
# Extract sentence segments from wikitext and print them with markup stripped.
# Imports and the process_sentences() signature are reconstructed from the call
# sites below; `text` is assumed to hold the revision's wikitext.
import mwparserfromhell as mwp
from mwparserfromhell.nodes import Wikilink
from deltas.segmenters import MatchableSegment, ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split

segmenter = ParagraphsSentencesAndWhitespace()


def process_sentences(segments):
    sentences = []
    for paragraph_or_whitespace in segments:
        if isinstance(paragraph_or_whitespace, MatchableSegment):
            paragraph = paragraph_or_whitespace  # We have a paragraph
            for sentence_or_whitespace in paragraph:
                if isinstance(sentence_or_whitespace, MatchableSegment):
                    sentence = sentence_or_whitespace  # We have a sentence
                    sentences.append(sentence)
    return sentences


def my_strip_code(wikicode):
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):
    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if isinstance(node, Wikilink):
            stripped = stripped.split("|")[-1]
        if stripped is not None:
            yield str(stripped)


tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))
for sentence in sentences:
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", " ")))
    print(" *", raw_sentence)
def create_diff_for_revision(prev_rev_id, rev_id, db_filepath):
    # First, retrieve the text of both revisions from the database.
    db = get_db(db_filepath)
    try:
        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?",
                            (prev_rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {prev_rev_id} in database.")
        prev_content = result[0]['content']

        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?",
                            (rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        curr_content = result[0]['content']
    finally:
        db.close()

    # Second, tokenize the texts.
    prev_tokens = wikitext_split.tokenize(prev_content)
    curr_tokens = wikitext_split.tokenize(curr_content)

    # Third, identify segments that were inserted and removed, tracking the
    # tokens that were added and subtracted.
    all_removed_tokens = []
    all_inserted_tokens = []
    delete_count = 0
    insert_count = 0
    for segment in segment_matcher.diff(prev_tokens, curr_tokens):
        if segment.name == 'equal':
            continue
        elif segment.name == 'delete':
            removed_tokens = prev_tokens[segment.a1:segment.a2]
            removed_tokens.insert(0, 'REMOVAL_START')
            removed_tokens.append('REMOVAL_END')
            all_removed_tokens.extend(removed_tokens)
            delete_count += 1
        elif segment.name == 'insert':
            inserted_tokens = curr_tokens[segment.b1:segment.b2]
            inserted_tokens.insert(0, 'INSERTION_START')
            inserted_tokens.append('INSERTION_END')
            all_inserted_tokens.extend(inserted_tokens)
            insert_count += 1
        else:
            raise ValueError('Substitutions are not implemented by the segment matcher.')

    content_token_count = len(curr_tokens)
    # TODO: consider avoiding writing out very long articles
    # (e.g. if len(curr_tokens) >= 100000).
    return {'prev_rev_id': prev_rev_id,
            'rev_id': rev_id,
            'delete_count': delete_count,
            'insert_count': insert_count,
            'content_token_count': content_token_count,
            'content_tokens': curr_tokens,
            'removed_tokens': all_removed_tokens,
            'inserted_tokens': all_inserted_tokens}
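# Hypothetical invocation of create_diff_for_revision: the revision ids and the
# SQLite path are placeholders, and get_db() is assumed to return a connection
# whose rows can be indexed by column name (e.g. with sqlite3.Row as row_factory).
diff = create_diff_for_revision(638029546, 638077284, "revisions.sqlite")
print(diff['delete_count'], "deleted segments;", diff['insert_count'], "inserted segments")
print("current revision has", diff['content_token_count'], "tokens")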
import random
import time
import pickle

from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split, text_split
from mw import api

segmenter = ParagraphsSentencesAndWhitespace()

session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']
common1_tokens = list(wikitext_split.tokenize(common1))
common2_tokens = list(wikitext_split.tokenize(common2))

words = [l.strip() for l in open('/usr/share/dict/words')]
random1 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)
random2 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)
random2_tokens = list(wikitext_split.tokenize(random2))
random1_tokens = list(wikitext_split.tokenize(random1))

print("Tokenizing:")
def process_content_tokens(revision_content):
    return wikitext_split.tokenize(revision_content)
def process_tokens(revision_text):
    if revision_text is None:
        raise RevisionDocumentNotFound()
    return [t for t in wikitext_split.tokenize(revision_text)]
def process_tokens(revision_text):
    return [t for t in wikitext_split.tokenize(revision_text or '')]
def process_tokens(revision_text):
    if revision_text is None:
        raise RevisionNotFound()
    return [t for t in wikitext_split.tokenize(revision_text)]
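# A small smoke test for the process_tokens variants above; the sample wikitext
# is arbitrary. The `or ''` variant returns an empty token list for missing
# text, while the guarded variants raise their respective *NotFound exceptions.
print(len(process_tokens("Some [[wiki]] ''text''.")), "tokens")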