Example #1
def process_operations(parent_revision_text, revision_text):
    parent_revision_text = parent_revision_text or ''
    revision_text = revision_text or ''

    a = wikitext_split.tokenize(parent_revision_text)
    b = wikitext_split.tokenize(revision_text)

    return [op for op in segment_matcher.diff(a, b)], a, b
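
A minimal usage sketch for process_operations above; the two revision strings are invented, and the operation attributes (name, a1, a2, b1, b2) are read the same way Example #7 does.

# Hypothetical revision texts for illustration only.
old_text = "The quick brown fox jumps over the lazy dog."
new_text = "The quick red fox jumps over the sleeping dog."

operations, a, b = process_operations(old_text, new_text)
for op in operations:
    if op.name == "insert":
        print("inserted:", "".join(str(t) for t in b[op.b1:op.b2]))
    elif op.name == "delete":
        print("deleted:", "".join(str(t) for t in a[op.a1:op.a2]))
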
Example #2
def process_operations(parent_revision_text, revision_text):
    parent_revision_text = parent_revision_text or ''
    revision_text = revision_text or ''

    a = wikitext_split.tokenize(parent_revision_text)
    b = wikitext_split.tokenize(revision_text)

    return [op for op in segment_matcher.diff(a, b)], a, b
Example #3
def tokenize_common():
    start = time.time()
    for _ in range(50):
        tokens = list(text_split.tokenize(common1))
    print("\ttext_split: {0}".format((time.time() - start) / 50))

    start = time.time()
    for _ in range(50):
        tokens = list(wikitext_split.tokenize(common1))
    print("\twikitext_split: {0}".format((time.time() - start) / 50))
Example #4
def tokenize_common():
    start = time.time()
    for _ in range(50):
        tokens = list(text_split.tokenize(common1))
    print("\ttext_split: {0}".format((time.time() - start)/50))

    start = time.time()
    for _ in range(50):
        tokens = list(wikitext_split.tokenize(common1))
    print("\twikitext_split: {0}".format((time.time() - start)/50))
Example #5
def process_sentences(segments):
    sentences = []
    for paragraph_or_whitespace in segments:
        if isinstance(paragraph_or_whitespace, MatchableSegment):
            paragraph = paragraph_or_whitespace  # We have a paragraph
            for sentence_or_whitespace in paragraph:
                if isinstance(sentence_or_whitespace, MatchableSegment):
                    sentence = sentence_or_whitespace  # We have a sentence
                    sentences.append(sentence)
    return sentences


def my_strip_code(wikicode):
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):

    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if isinstance(node, Wikilink):
            stripped = stripped.split("|")[-1]
        if stripped is not None:
            yield str(stripped)


tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))

for sentence in sentences:
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", "  ")))
    print("  *", raw_sentence)
Example #6
def process_sentences(segments):
    sentences = []
    for paragraph_or_whitespace in segments:
        if isinstance(paragraph_or_whitespace, MatchableSegment):
            paragraph = paragraph_or_whitespace  # We have a paragraph
            for sentence_or_whitespace in paragraph:
                if isinstance(sentence_or_whitespace, MatchableSegment):
                    sentence = sentence_or_whitespace  # We have a sentence
                    sentences.append(sentence)
    return sentences


def my_strip_code(wikicode):
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):

    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if isinstance(node, Wikilink):
            stripped = stripped.split("|")[-1]
        if stripped is not None:
            yield str(stripped)

tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))

for sentence in sentences:
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", "  ")))
    print("  *", raw_sentence)
Example #7
def create_diff_for_revision(prev_rev_id, rev_id, db_filepath):
    # first, retrieve the text from the database
    try:
        db = get_db(db_filepath)
        
        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?", (prev_rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        result = result[0]
        prev_content = result['content']
        
        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?", (rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        result = result[0]
        curr_content = result['content']
    finally:
        db.close()
    
    # second, tokenize the texts
    prev_tokens = wikitext_split.tokenize(prev_content)
    curr_tokens = wikitext_split.tokenize(curr_content)
    
    # third, identify segments that were inserted and removed, tracking the tokens that were added and subtracted
    all_removed_tokens = []
    all_inserted_tokens = []
    delete_count = 0
    insert_count = 0
    for segment in segment_matcher.diff(prev_tokens, curr_tokens):
        if segment.name == 'equal':
            continue
        elif segment.name == 'delete':
            removed_tokens = prev_tokens[segment.a1:segment.a2]
            removed_tokens.insert(0, 'REMOVAL_START')
            removed_tokens.append('REMOVAL_END')
            all_removed_tokens.extend(removed_tokens)
            delete_count += 1
        elif segment.name == 'insert':
            inserted_tokens = curr_tokens[segment.b1:segment.b2]
            inserted_tokens.insert(0, 'INSERTION_START')
            inserted_tokens.append('INSERTION_END')
            all_inserted_tokens.extend(inserted_tokens)
            insert_count += 1
        else:
            raise ValueError('Substitutions are not implemented by the segment matcher.')
    content_token_count = len(curr_tokens)
    #if len(curr_tokens) >= 100000: # TODO consider avoiding writing out very long articles
    return {'prev_rev_id': prev_rev_id, 
            'rev_id': rev_id,
            'delete_count': delete_count,
            'insert_count': insert_count,
            'content_token_count': content_token_count,
            'content_tokens': curr_tokens,
            'removed_tokens': all_removed_tokens, 
            'inserted_tokens': all_inserted_tokens}
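
A hypothetical call to create_diff_for_revision above: the revision IDs and the SQLite path are placeholders, and get_db() is assumed to return a connection whose rows support dict-style access (for example via sqlite3.Row).

# Placeholder arguments; adjust to a real database and revision pair.
diff = create_diff_for_revision(prev_rev_id=638029546,
                                rev_id=638077284,
                                db_filepath="enwiki_revisions.sqlite")

print("{0} segments deleted, {1} inserted, {2} tokens total".format(
    diff['delete_count'], diff['insert_count'], diff['content_token_count']))
print("first removed tokens:", [str(t) for t in diff['removed_tokens'][:10]])
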
Example #8
import random
import time
import pickle

from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split, text_split
from mw import api

segmenter = ParagraphsSentencesAndWhitespace()

session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']

common1_tokens = list(wikitext_split.tokenize(common1))
common2_tokens = list(wikitext_split.tokenize(common2))

words = [l.strip() for l in open('/usr/share/dict/words')]
random1 = ''.join(
    random.choice(words) if t.type == "word" else str(t)
    for t in common1_tokens)
random2 = ''.join(
    random.choice(words) if t.type == "word" else str(t)
    for t in common1_tokens)

random2_tokens = list(wikitext_split.tokenize(random2))
random1_tokens = list(wikitext_split.tokenize(random1))

print("Tokenizing:")
Example #9
def process_content_tokens(revision_content):
    return wikitext_split.tokenize(revision_content)
Example #10
def process_tokens(revision_text):
    if revision_text is None:
        raise RevisionDocumentNotFound()
    return [t for t in wikitext_split.tokenize(revision_text)]
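
A minimal sketch of inspecting the tokens returned by process_tokens above; the sample wikitext is made up, and the type attribute is the same one Example #8 filters on (t.type == "word").

from collections import Counter

tokens = process_tokens("'''Deltas''' computes diffs between [[revision]]s.")
print(Counter(t.type for t in tokens).most_common(5))   # most frequent token types
print([str(t) for t in tokens[:8]])                     # raw text of the first tokens
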
Example #11
def process_tokens(revision_text):
    return [t for t in wikitext_split.tokenize(revision_text or '')]
Example #12
import random
import time
import pickle

from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split, text_split
from mw import api

segmenter = ParagraphsSentencesAndWhitespace()

session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']

common1_tokens = list(wikitext_split.tokenize(common1))
common2_tokens = list(wikitext_split.tokenize(common2))

words = [l.strip() for l in open('/usr/share/dict/words')]
random1 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)
random2 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)

random2_tokens = list(wikitext_split.tokenize(random2))
random1_tokens = list(wikitext_split.tokenize(random1))

print("Tokenizing:")
def tokenize_common():
    start = time.time()
    for _ in range(50):
        tokens = list(text_split.tokenize(common1))
    print("\ttext_split: {0}".format((time.time() - start)/50))

    start = time.time()
    for _ in range(50):
        tokens = list(wikitext_split.tokenize(common1))
    print("\twikitext_split: {0}".format((time.time() - start)/50))
Example #13
def process_tokens(revision_text):
    return [t for t in wikitext_split.tokenize(revision_text or '')]
Example #14
def process_content_tokens(revision_content):
    return wikitext_split.tokenize(revision_content)
Example #15
def process_tokens(revision_text):
    if revision_text is None:
        raise RevisionNotFound()
    return [t for t in wikitext_split.tokenize(revision_text)]