Exemplo n.º 1
0
    for _ in range(25):
        operations = list(processor.process(common1))
        operations = list(processor.process(common2))
    print("\tcommon_fast: {0}".format((time.time() - start)/50))
# Run the benchmark as soon as the script reaches this point.
segment_common_fast()
# Disabled: profiles segment_common() instead, with sort="cumulative".
#profile.run('segment_common()', sort="cumulative")

def segment_random(runs=25):
    """Benchmark ``segment_matcher.diff`` on the random token sequences.

    Diffs ``random1_tokens`` against ``random2_tokens`` ``runs`` times,
    consuming the whole result each time, and prints the mean number of
    seconds per run.

    Args:
        runs: Number of timed repetitions; must be at least 1.  Defaults
            to 25, the count that used to be hard-coded.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    for _ in range(runs):
        # list() consumes the whole result in case diff() is lazy; the
        # operations themselves are not needed.
        list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.perf_counter() - start)/runs))
#segment_random()
#profile.run('segment_random()', sort="cumulative")

# Segment every token sequence once, up front, so the benchmarks below can
# hand ready-made segments to diff_segments() inside their timed loops.
common1_segments = segmenter.segment(common1_tokens)
common2_segments = segmenter.segment(common2_tokens)
random1_segments = segmenter.segment(random1_tokens)
random2_segments = segmenter.segment(random2_tokens)

# Banner for the benchmark results printed after this point.
print("Running segment matcher (post segmentation):")
def segment_common_seg(runs=25):
    """Benchmark ``segment_matcher.diff_segments`` on the common segments.

    Diffs ``common1_segments`` against ``common2_segments`` ``runs`` times,
    consuming the whole result each time, and prints the mean number of
    seconds per run.

    Args:
        runs: Number of timed repetitions; must be at least 1.  Defaults
            to 25, the count that used to be hard-coded.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    for _ in range(runs):
        # list() consumes the whole result in case diff_segments() is lazy;
        # the operations themselves are not needed.
        list(segment_matcher.diff_segments(common1_segments, common2_segments))
    print("\tcommon: {0}".format((time.perf_counter() - start)/runs))
# Run the post-segmentation benchmark for the common inputs right away.
segment_common_seg()
# Disabled: profiles the same benchmark, with sort="cumulative".
#profile.run('segment_common_seg()', sort="cumulative")

def segment_random_seg():
    start = time.time()
Exemplo n.º 2
0
    sentences = []
    for paragraph_or_whitespace in segments:
        if isinstance(paragraph_or_whitespace, MatchableSegment):
            paragraph = paragraph_or_whitespace  # We have a paragraph
            for sentence_or_whitespace in paragraph:
                if isinstance(sentence_or_whitespace, MatchableSegment):
                    sentence = sentence_or_whitespace  # We have a sentence
                    sentences.append(sentence)
    return sentences


def my_strip_code(wikicode):
    """Return the stripped text of ``wikicode`` as one string.

    Joins, with no separator, every fragment yielded by
    ``_my_strip_code`` for the given wikicode.
    """
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):
    """Yield the stripped text of each node in ``wikicode.nodes``, in order.

    Each node is stripped with ``__strip__(normalize=True, collapse=True)``
    and the result is yielded as a ``str``.  Nodes that strip to ``None``
    yield nothing.  For ``Wikilink`` nodes only the part after the last
    ``"|"`` is kept.
    """
    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if stripped is None:
            # Skip before the Wikilink handling: calling .split() on None
            # would raise AttributeError instead of dropping the node.
            continue
        if isinstance(node, Wikilink):
            # Keep only the text after the last pipe.
            stripped = stripped.split("|")[-1]
        yield str(stripped)

# Tokenize the input text, segment the tokens, and pass the segments to
# process_sentences() to obtain the sentences to print.
tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))

# Print one bullet per sentence.
for sentence in sentences:
    # Each newline in the sentence is replaced with two spaces before the
    # text is parsed; my_strip_code() then reduces the parse to a string.
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", "  ")))
    print("  *", raw_sentence)
    for paragraph_or_whitespace in segments:
        if isinstance(paragraph_or_whitespace, MatchableSegment):
            paragraph = paragraph_or_whitespace  # We have a paragraph
            for sentence_or_whitespace in paragraph:
                if isinstance(sentence_or_whitespace, MatchableSegment):
                    sentence = sentence_or_whitespace  # We have a sentence
                    sentences.append(sentence)
    return sentences


def my_strip_code(wikicode):
    """Return the stripped text of ``wikicode`` as one string.

    Joins, with no separator, every fragment yielded by
    ``_my_strip_code`` for the given wikicode.
    """
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):
    """Yield the stripped text of each node in ``wikicode.nodes``, in order.

    Each node is stripped with ``__strip__(normalize=True, collapse=True)``
    and the result is yielded as a ``str``.  Nodes that strip to ``None``
    yield nothing.  For ``Wikilink`` nodes only the part after the last
    ``"|"`` is kept.
    """
    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if stripped is None:
            # Skip before the Wikilink handling: calling .split() on None
            # would raise AttributeError instead of dropping the node.
            continue
        if isinstance(node, Wikilink):
            # Keep only the text after the last pipe.
            stripped = stripped.split("|")[-1]
        yield str(stripped)


# Tokenize the input text, segment the tokens, and pass the segments to
# process_sentences() to obtain the sentences to print.
tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))

# Print one bullet per sentence.
for sentence in sentences:
    # Each newline in the sentence is replaced with two spaces before the
    # text is parsed; my_strip_code() then reduces the parse to a string.
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", "  ")))
    print("  *", raw_sentence)
Exemplo n.º 4
0
# Run the benchmark as soon as the script reaches this point.
segment_common_fast()
# Disabled: profiles segment_common() instead, with sort="cumulative".
#profile.run('segment_common()', sort="cumulative")


def segment_random(runs=25):
    """Benchmark ``segment_matcher.diff`` on the random token sequences.

    Diffs ``random1_tokens`` against ``random2_tokens`` ``runs`` times,
    consuming the whole result each time, and prints the mean number of
    seconds per run.

    Args:
        runs: Number of timed repetitions; must be at least 1.  Defaults
            to 25, the count that used to be hard-coded.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    for _ in range(runs):
        # list() consumes the whole result in case diff() is lazy; the
        # operations themselves are not needed.
        list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.perf_counter() - start) / runs))


#segment_random()
#profile.run('segment_random()', sort="cumulative")

# Segment every token sequence once, up front, so the benchmarks below can
# hand ready-made segments to diff_segments() inside their timed loops.
common1_segments = segmenter.segment(common1_tokens)
common2_segments = segmenter.segment(common2_tokens)
random1_segments = segmenter.segment(random1_tokens)
random2_segments = segmenter.segment(random2_tokens)

# Banner for the benchmark results printed after this point.
print("Running segment matcher (post segmentation):")


def segment_common_seg(runs=25):
    """Benchmark ``segment_matcher.diff_segments`` on the common segments.

    Diffs ``common1_segments`` against ``common2_segments`` ``runs`` times,
    consuming the whole result each time, and prints the mean number of
    seconds per run.

    Args:
        runs: Number of timed repetitions; must be at least 1.  Defaults
            to 25, the count that used to be hard-coded.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    for _ in range(runs):
        # list() consumes the whole result in case diff_segments() is lazy;
        # the operations themselves are not needed.
        list(
            segment_matcher.diff_segments(common1_segments, common2_segments))
    print("\tcommon: {0}".format((time.perf_counter() - start) / runs))