Example #1
# for max_len heuristic
import curses
from curses.ascii import isdigit
import nltk
from nltk.corpus import cmudict

import pyphen

d = cmudict.dict()


def nsyl(word):
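    # Each cmudict pronunciation is a list of ARPABET phonemes; vowel phonemes
    # end in a stress digit (e.g. 'AH0'), so counting phonemes whose last
    # character is a digit yields one syllable count per pronunciation.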
    return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]]


dic = pyphen.Pyphen(lang='nl_NL')

ok_clusters = {'ch', 'sh', 'th'}


def max_pinyin_length(name):
    """
    Heuristic for estimating the maximum number of pinyin syllables for an English name.
    """
    num_syllables = 0
    try:
        num_syllables = nsyl(name)[0]
    except (KeyError, IndexError):
        # Word not in cmudict; fall back to the hyphenation count below.
        pass
    hyphenated = dic.inserted(name).split('-')
    hyph_count = len(hyphenated)
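
The snippet above is cut off before the two counts are combined. A minimal sketch of one way to finish the heuristic, assuming it simply takes the larger of the CMU syllable count and the hyphenation count (the actual combination rule is not shown in the original):

def max_pinyin_length_sketch(name):
    """Hypothetical completion: return the larger of the two counts."""
    try:
        num_syllables = nsyl(name)[0]
    except (KeyError, IndexError):
        num_syllables = 0
    hyph_count = len(dic.inserted(name).split('-'))
    return max(num_syllables, hyph_count)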
Example #2
File: text.py Project: takis/WeasyPrint
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001
    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, hinting,
                                   max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space’s advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars

    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (first_part + first_word_part +
                                  style.hyphenate_character)
                temp_layout = create_layout(new_first_line, style, hinting,
                                            max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    if temp_first_line_width <= max_width:
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            new_first_line = new_first_line.rstrip(
                new_first_line[-(len(style.hyphenate_character)):])
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False

        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (len(new_first_line)
                                  if temp_second_line is None else
                                  temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
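
The byte offsets used above (``length``, ``resume_at``, ``start_index``) refer to the UTF-8 encoding of the text, which is why slicing goes through a utf8_slice helper instead of indexing the str directly. A minimal sketch of what such a helper presumably looks like (its real definition is not part of this snippet):

def utf8_slice(string, byte_slice):
    # Slice the encoded bytes, then decode back to str.
    return string.encode('utf-8')[byte_slice].decode('utf-8')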
Example #3
# Imports needed by this truncated snippet (the original file starts earlier):
import pyphen
from nltk import FreqDist
from nltk.corpus import brown

gore = ['blood','flesh','bloody','bloodstained','mangled','liver','heart','brain','splatter','splattering','splattered',
        'carnage','slash','slashed','slashing','organ','slaughter','slaughtered','slaughtering'] #TODO good list
#cohesives = []
pronouns = ['I','my','me','mine','myself','you','your','yours','yourself','he','him','his','himself','she','her','hers','herself','it',
            'its','itself','they','them','theirs','yourselves','themselves']
causalverbs = ['make','made','cause','caused','allow','allowed','help','helped','have','had','enable','enabled','keep','kept',
               'hold','held','let','force','forced','require','required','making','causing','allowing','helping','having',
               'enabling','keeping','holding','letting','forcing','requiring']
causalparts = ['because','despite','resulting','thus','consequently','so','as','since']
hedgesndt = ['almost','maybe','somewhat','likely','barely','mildly','little','pretty','fairly']
amplifiers = ['completely','extremely','incredibly','quite','very','mostly','amazingly','really','definitely','exactly',
              'awfully']
negations = ['not','neither','nor','none','t','\'t','never','nobody','nowhere','no']
semper = ['seem','appear','seemed','appeared','seeming','appearing']
bow_list = [] #Total word list for bag of words features
dic = pyphen.Pyphen(lang='en')
print('Word lists created')
brownwords = FreqDist()
for sentence in brown.sents():
    for word in sentence:
        brownwords[word] += 1
print('Brown corpus loaded')

"""
FUNCTIONS
"""

def main():
    object = storydata(True)
    train_data = object.getTrain()
    test_data = object.getTest()
Example #4
File: test.py Project: mbr/Pyphen
def test_alternative():
    """Test the alternative parser."""
    dic = pyphen.Pyphen(lang='hu', left=1, right=1)
    assert tuple(dic.iterate('kulissza')) == (('kulisz', 'sza'), ('ku',
                                                                  'lissza'))
    assert dic.inserted('kulissza') == 'ku-lisz-sza'
Example #5
def get_number_syllables(self):
    dic = pyphen.Pyphen(lang='en')
    return sum([len(dic.inserted(word).split("-")) for word in self.words])
Example #6
File: test.py Project: mbr/Pyphen
def test_inserted():
    """Test the ``inserted`` method."""
    dic = pyphen.Pyphen(lang='nl_NL')
    assert dic.inserted('lettergrepen') == 'let-ter-gre-pen'
Example #7
File: test.py Project: mbr/Pyphen
def test_iterate():
    """Test the ``iterate`` method."""
    dic = pyphen.Pyphen(lang='nl_NL')
    assert tuple(dic.iterate('Amsterdam')) == (('Amster', 'dam'), ('Am',
                                                                   'sterdam'))
Example #8
import re
from functools import partial
from subprocess import PIPE, Popen
from urllib.parse import urlparse

import bleach
import commonmark as cm
import html5lib
import pyphen
import smartypants
from django.apps import apps
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured

hyphen_dict = pyphen.Pyphen(lang="en_US")


def insert_node_to_ast(tag, block, matchedobj):
    """Insert trivial block inside of given block in ast node."""
    target_eq = matchedobj.groups()[1]
    block.t = 'html_inline'
    return '<{tag}>{val}</{tag}>'.format(tag=tag, val=target_eq)


re_inlines_to_replace = (
    (re.compile(r'(~([^ ~]*)~)'), partial(insert_node_to_ast, 'sub')),
    (re.compile(r'(\^([^ \^]*)\^)'), partial(insert_node_to_ast, 'sup')),
)
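
Outside the CommonMark AST plumbing, the two patterns simply capture the text between ~...~ and ^...^ markers. A standalone illustration with hypothetical inputs:

import re

sub_re = re.compile(r'(~([^ ~]*)~)')
sup_re = re.compile(r'(\^([^ \^]*)\^)')

# groups()[1] is the inner text, exactly what insert_node_to_ast wraps in a tag.
print(sub_re.sub(lambda m: '<sub>{}</sub>'.format(m.groups()[1]), 'H~2~O'))  # H<sub>2</sub>O
print(sup_re.sub(lambda m: '<sup>{}</sup>'.format(m.groups()[1]), 'x^2^'))   # x<sup>2</sup>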


def inject_subsup_tags(ast):
Example #9
# coding=utf-8
import pyphen
import re

hyp = pyphen.Pyphen(lang='hu_HU')


def hyphenated(text):
    res = []
    for w in text.decode('utf-8').split(u' '):
        h = hyp.inserted(w, '&shy;')
        h = h.replace('cs&shy;cs', 'ccs')
        h = h.replace('dz&shy;dz', 'ddz')
        h = h.replace('dzs&shy;dzs', 'ddzs')
        h = h.replace('gy&shy;gy', 'ggy')
        h = h.replace('ly&shy;ly', 'lly')
        h = h.replace('ny&shy;ny', 'nny')
        h = h.replace('sz&shy;sz', 'ssz')
        h = h.replace('ty&shy;ty', 'tty')
        h = h.replace('zs&shy;zs', 'zzs')
        res.append(h)
    return ' '.join(res).encode('utf-8')


lines = []
prev = None
separator = False
heading = False
body = False
with open('a-tavoli-fa.html') as f:
    for l in f:
Example #10
File: parse.py Project: joe3141/NLP-labs
# Imports needed by this truncated snippet (vectorizer, X_test and y_test are
# defined in the part of the file that is cut off above):
import enchant
import numpy as np
import pandas as pd
import pyphen
from collections import Counter
from nltk import pos_tag, sent_tokenize, word_tokenize

test = pd.DataFrame(
    np.hstack(
        (vectorizer.transform(X_test).toarray(), np.array(y_test)[:, None])))

train.to_csv("train.csv", header=False, index=False)
test.to_csv("test.csv", header=False, index=False)

spam_list = []
with open("spam_word_list.txt", "r") as f:
    spam_list = [
        word.strip().lower() for word in f.readlines() if word != "\n"
    ]

d = enchant.Dict("en_US")
pyphen.language_fallback('nl_NL_variant1')
dic = pyphen.Pyphen(lang='en_GB')


def extract_features(doc):
    doc = doc.lower()
    res = []
    tokens = word_tokenize(doc)
    sents = sent_tokenize(doc)
    # Number of sentences
    res.append(len(sents))

    # Number of verbs
    tags = pos_tag(tokens)
    counts = Counter(token[1] for token in tags)
    res.append(counts["VB"])
Example #11
#-*- coding: utf-8 -*-

# Program to split words into syllables and calculate a comprehensibility coefficient

# Declare titles:
titles = ['NCzas', 'Newsweek', 'Onet', 'Polityka', 'WPolityce']
import pyphen, csv, re, math

# Declare handler to consume words and split into syllables:
dic = pyphen.Pyphen(lang='pl_PL')

for title in titles:
    path = "C:\\Users\\Ilona\\PycharmProjects\\TestDataGenerator\\ArticleText\\" + title + "Text.txt"
    file = open(path)
    reader = csv.reader(file)

    # Declare pattern to remove all non-alphanumeric characters:
    pattern = re.compile(r'[\W_]+')
    sentLen = []
    articleText = []
    # Split into syllables:
    for line in reader:
        for item in line:
            sentences = item.split('.')
            for sentence in sentences:
                words = sentence.split(' ')
                words = list(filter(len, words))

                # Counter for counting words longer than 3 syllables:

                longerThan3 = 0
Example #12
    def __init__(self, language=None, ablate=None, features_to_use=None):
        """
        Define basic properties

        Args:
            language(str): language of input data
            features_to_use: a list of string named features to use
        """
        # This dict contains all available features, along with a list of their
        # high-computing-power requirements e.g. ['spacy']
        feature_requirements = {
            # TODO: Actually fill in these requirements. At the moment, I'm just putting everything as all requirements.
            'is_nounphrase': ['spacy'],
            'len_tokens_norm': ['spacy'],
            'hypernym_count': None,
            'len_chars_norm': None,
            'len_tokens': None,
            'len_syllables': ['hyph'],
            'consonant_freq': None,
            'gr_or_lat': ['affix'],
            'is_capitalised': None,
            'num_complex_punct': None,
            'avg_chars_p_word': None,
            'sent_length': None,
            'unigram_prob': ['unigram_probs'],
            'char_n_gram_feats': None,
            'sent_n_gram_feats': None,
            'iob_tags': ['spacy'],
            'lemma_feats': ['spacy'],
            'bag_of_shapes': ['spacy'],
            'pos_tag_counts': ['spacy'],
            'NER_tag_counts': ['spacy'],
        }

        if features_to_use is None or features_to_use == 'all':
            features_to_use = list(feature_requirements.keys())

        # Total requirements is a unique list of all the requirements.
        self.total_requirements = set()
        final_features = []
        for feature in features_to_use:

            # Making sure that we know about the feature
            if feature in feature_requirements.keys():
                if feature_requirements[feature] is not None:
                    for requirement in feature_requirements[feature]:
                        self.total_requirements.add(requirement)
                final_features.append(feature)
            else:
                print(
                    "{} did not match any of the features in feature_requirements, so was not used."
                    .format(feature))

        self.features_to_use = final_features

        self.affixes = {}
        self.spacy_models = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }
        self.hyph_dictionaries = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }
        self.unigram_prob_dict = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }

        # So that we're only opening this file once.
        if 'affix' in self.total_requirements:
            self.affixes = affix_features.get_affixes()

        if language == 'english':
            if 'spacy' in self.total_requirements:
                self.spacy_models = {'english': spacy.load('en_core_web_lg')}

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries = {'english': pyphen.Pyphen(lang='en')}

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict = {
                    'english':
                    file_io.read_file('data/external/english_u_prob.csv')
                }

        elif language == 'spanish':
            if 'spacy' in self.total_requirements:
                self.spacy_models = {'spanish': spacy.load('es_core_news_md')}

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries = {'spanish': pyphen.Pyphen(lang='es')}

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict = {
                    'spanish':
                    file_io.read_file('data/external/spanish_u_prob.csv')
                }

        elif language == 'german':
            if 'spacy' in self.total_requirements:
                self.spacy_models = {'german': spacy.load('de_core_news_sm')}

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries = {'german': pyphen.Pyphen(lang='de')}

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict = {
                    'german':
                    file_io.read_file('data/external/german_u_prob.csv')
                }

        elif language == 'french':
            if 'spacy' in self.total_requirements:
                self.spacy_models = {'french': spacy.load('fr_core_news_md')}

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries = {'french': pyphen.Pyphen(lang='fr')}

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict = {
                    'french':
                    file_io.read_file('data/external/french_u_prob.csv')
                }

        else:
            if 'spacy' in self.total_requirements:
                self.spacy_models = {
                    'english': spacy.load('en_core_web_lg'),
                    'spanish': spacy.load("es_core_news_md"),
                    'german': spacy.load('de_core_news_sm'),
                    'french': spacy.load('fr_core_news_md')
                }

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries = {
                    'english': pyphen.Pyphen(lang='en'),
                    'spanish': pyphen.Pyphen(lang='es'),
                    'german': pyphen.Pyphen(lang='de'),
                    'french': pyphen.Pyphen(lang='fr')
                }

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict = {
                    'english':
                    file_io.read_file('data/external/english_u_prob.csv'),
                    'spanish':
                    file_io.read_file('data/external/spanish_u_prob.csv'),
                    'german':
                    file_io.read_file('data/external/german_u_prob.csv'),
                    'french':
                    file_io.read_file('data/external/french_u_prob.csv')
                }

        self.ablate = ablate
Example #13
File: text.py Project: iwschris/WeasyPrint
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001
    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, hinting,
                                   max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space’s advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang
    total, left, right = style.hyphenate_limit_chars
    if hyphens in ('none', 'manual') or lang not in pyphen.LANGUAGES:
        # No automatic hyphenation
        return first_line_metrics(first_line, text, layout, resume_at)
    elif len(next_word) < total:
        # Next word is too small
        return first_line_metrics(first_line, text, layout, resume_at)

    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    if style.hyphenate_limit_zone.unit == '%':
        limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
    else:
        limit_zone = style.hyphenate_limit_zone.value

    hyphenated = False
    if space > limit_zone or space < 0:
        # The next word does not fit, try hyphenation
        dictionary_key = (lang, left, right, total)
        dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
        if dictionary is None:
            dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
            PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
        for first_word_part, _ in dictionary.iterate(next_word):
            new_first_line = (first_part + first_word_part +
                              style.hyphenate_character)
            temp_layout = create_layout(new_first_line, style, hinting,
                                        max_width)
            temp_lines = temp_layout.iter_lines()
            temp_first_line = next(temp_lines, None)
            temp_second_line = next(temp_lines, None)
            if (temp_second_line is None and space >= 0) or space < 0:
                hyphenated = True
                # TODO: find why there's no need to .encode
                resume_at = len(first_part + first_word_part)
                layout = temp_layout
                first_line = temp_first_line
                second_line = temp_second_line
                temp_first_line_width, _height = get_size(temp_first_line)
                if temp_first_line_width <= max_width:
                    break
    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
Example #14
def split_first_line(text, style, context, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = create_layout(text, style, context, max_width)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style.hyphenate_character)
Example #15
File: test.py Project: mbr/Pyphen
def test_upper_alternative():
    """Test uppercase with alternative parser."""
    dic = pyphen.Pyphen(lang='hu', left=1, right=1)
    assert tuple(dic.iterate('KULISSZA')) == (('KULISZ', 'SZA'), ('KU',
                                                                  'LISSZA'))
    assert dic.inserted('KULISSZA') == 'KU-LISZ-SZA'
Example #16
import pyphen

dic = pyphen.Pyphen(lang='en_US')


def insert_soft_hyphens(text, hyphen='\xad'):
    """Insert the hyphen in breaking pointsaccording to the dictionary.
    
    '\xad' is the Soft Hyphen (SHY) character
    """
    lines = []
    for line in text.splitlines():
        hyph_words = [dic.inserted(word, hyphen) for word in line.split()]
        lines.append(' '.join(hyph_words))
    return '\n'.join(lines)
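
A quick usage sketch (hypothetical input; the exact break points depend on the en_US dictionary):

if __name__ == '__main__':
    sample = 'hyphenation dictionaries are language specific'
    # Pass a visible '-' instead of the default soft hyphen so the break
    # points show up in a terminal.
    print(insert_soft_hyphens(sample, hyphen='-'))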
Example #17
File: test.py Project: mbr/Pyphen
def test_all_dictionaries():
    """Test that all included dictionaries can be parsed."""
    for lang in pyphen.LANGUAGES:
        pyphen.Pyphen(lang=lang)
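
pyphen.LANGUAGES maps language codes to the bundled dictionary files, so the loop above exercises every shipped dictionary. A small inspection sketch:

import pyphen

# A few of the available language codes and the file behind one of them
# (exact contents depend on the installed Pyphen version).
print(sorted(pyphen.LANGUAGES)[:5])
print(pyphen.LANGUAGES['nl_NL'])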
Example #18
import os, re, numpy as np, pandas as pd, json
import pyphen
from collections import Counter
from collections import defaultdict

dic = pyphen.Pyphen(lang='de_DE')

df = pd.read_csv('german_viz/data/nouns.csv', encoding='utf-8-sig')[['lemma','genus','suffix']]

print(df)
nouns = df['lemma'].values
genders = df['genus'].values
suffixgenders = genders[:34]
all_suffixs = df['suffix'].values.tolist()
suffixs = df['suffix'].values.tolist()[:34]

suffix_dict_freq = defaultdict(int)
for i in range(len(all_suffixs)):
    suffix_dict_freq[all_suffixs[i]] += 1
suffix_dict_freq
links, nodes = [], []

gender_freq = defaultdict(int)
for i in range(len(genders)):
    gender_freq[genders[i]] += 1
gender_freq

mas = {'name': 'masculine',
       'freq': gender_freq['m'],
           'i': -1
       }
Example #19
File: test.py Project: mbr/Pyphen
def test_wrap():
    """Test the ``wrap`` method."""
    dic = pyphen.Pyphen(lang='nl_NL')
    assert dic.wrap('autobandventieldopje',
                    11) == ('autoband-', 'ventieldopje')
Example #20
def test_fallback_dict():
    """Test the ``iterate`` method with a fallback dict."""
    dic = pyphen.Pyphen(lang='nl_NL-variant')
    assert tuple(dic.iterate('Amsterdam')) == (('Amster', 'dam'), ('Am',
                                                                   'sterdam'))
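
The variant code above still works because Pyphen falls back to the closest available dictionary. pyphen.language_fallback exposes that resolution directly (a sketch; the expected result is an assumption based on the test above):

import pyphen

# Should resolve to 'nl_NL', since no dictionary is shipped for the variant.
print(pyphen.language_fallback('nl_NL-variant'))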
Example #21
File: test.py Project: mbr/Pyphen
def test_filename():
    """Test the ``filename`` parameter."""
    dic = pyphen.Pyphen(filename=pyphen.LANGUAGES['nl_NL'])
    assert dic.inserted('lettergrepen') == 'let-ter-gre-pen'
Example #22
def count_syllables(phrase):
    dic = pyphen.Pyphen(lang='en')
    return len(dic.inserted(phrase).replace('-', ' ').split())
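
Note that this counts hyphenation chunks rather than phonetic syllables, so it can disagree with a cmudict-based count such as nsyl in Example #1. A minimal usage sketch (hypothetical input, no particular count guaranteed):

if __name__ == '__main__':
    # Number of hyphenation chunks, used here as a rough syllable count.
    print(count_syllables('dictionary'))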
Example #23
File: test.py Project: mbr/Pyphen
def test_upper():
    """Test uppercase."""
    dic = pyphen.Pyphen(lang='nl_NL')
    assert dic.inserted('LETTERGREPEN') == 'LET-TER-GRE-PEN'
Example #24
def split_first_line(text, style, context, max_width, line_width,
                     justification_spacing):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style.white_space in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width is not None and max_width != float('inf'):
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, context,
                                   max_width, justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, context, max_width,
                               justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(first_line, text, layout, resume_at,
                                          space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (first_line_text.rsplit(
                            u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start()
                    for match in re.finditer(soft_hyphen, next_word)
                ]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes
                ]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(lang=lang,
                                               left=left,
                                               right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)
                ]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (new_first_line_text +
                                                  style.hyphenate_character)
                    new_layout = create_layout(hyphenated_first_line_text,
                                               style, context, max_width,
                                               justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    if new_second_line is None and (
                            new_space >= 0
                            or first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(layout.layout,
                                                 units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (first_line_text +
                                      style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        layout.set_text(text)
        pango.pango_layout_set_width(layout.layout,
                                     units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (len(text.encode('utf-8'))
                                  if temp_second_line is None else
                                  temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's not enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (first_line.length
                     if second_line is None else second_line.start_index)

    return first_line_metrics(first_line, text, layout, resume_at,
                              space_collapse, style, hyphenated,
                              style.hyphenate_character)
Example #25
def get_number_pollisyllable_words(self):
    dic = pyphen.Pyphen(lang='en')
    return len([
        word for word in self.words
        if len(dic.inserted(word).split("-")) >= 3
    ])
Example #26
import re
from emoji import UNICODE_EMOJI
from ast import literal_eval

import spacy
nlp = spacy.load('en_core_web_lg')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

import nltk
from nltk.tokenize import sent_tokenize, TweetTokenizer, casual_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('vader_lexicon')

import pyphen
PYPHEN_DIC = pyphen.Pyphen(lang='en')

from collections import Counter, OrderedDict, defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()


def cosine_similarity_sklearn(documents):
    X_train_counts = tfidf_vectorizer.fit_transform(documents)
    similarities = cosine_similarity(X_train_counts)
    return similarities.mean()


all_pos_tags = [
Example #27
import sys
import pyphen
import epitran
import nltk
import joblib

from nltk.tokenize import RegexpTokenizer

from inout.dta.corpus import Corpus
from inout.dta.poem import Poem

# sys.argv[1] Corpus path : ../../resources/Reim_Korpora/A_E_Parviol_Korpus/A_Parviol_Korpus
# sys.argv[2] meter model path : ./meter/meter.model.joblib

pyp = pyphen.Pyphen(lang='de')
epi = epitran.Epitran('deu-Latn')
#tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

c = Corpus(sys.argv[1])
poems = c.get_poems()

### for the meter model ###
meter_model = sys.argv[2]
clf = joblib.load(meter_model)
tokenizer = RegexpTokenizer(r'\w+')


def word2features(sentence, index):
    word = sentence[index]
    #print(word, len(word), "	", index)
    #postag = sentence[index][1]
Example #28
def __init__(self, lexicon):
    with open(lexicon, 'r') as f:
        self.lexicon = json.load(f)
    self.fallbackDict = pyphen.Pyphen(lang='en_US')
Example #29
def split_first_line(text,
                     style,
                     context,
                     max_width,
                     justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf')
            and style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, context,
                                   max_width, justification_spacing)
            first_line, index = layout.get_first_line()
            if index is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, context, max_width,
                               justification_spacing)
        first_line, index = layout.get_first_line()
    resume_at = index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if index is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    first_line_text = utf8_slice(text, slice(index))
    # We can’t rely on first_line_width, see
    # https://github.com/Kozea/WeasyPrint/issues/1051
    first_line_fits = (first_line_width <= max_width
                       or ' ' in first_line_text.strip() or can_break_text(
                           first_line_text.strip(), style['lang']))
    if first_line_fits:
        # The first line fits but may have been cut too early by Pango
        second_line_text = utf8_slice(text, slice(index, None))
    else:
        # The line can't be split earlier, try to hyphenate the first word.
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            first_line, index = layout.get_first_line()
            first_line_width, _ = get_size(first_line, style)
            if index is None and first_line_text:
                # The next word fits in the first line, keep the layout
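                # (the extra + 1 skips the space that follows next_word
                # in the original text, so the next line resumes at the
                # following word)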
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(first_line, text, layout, resume_at,
                                          space_collapse, style)
            elif index:
                # Text may have been split elsewhere by Pango earlier
                resume_at = index
            else:
                # There is no second line
                resume_at = first_line.length + 1
                if resume_at >= len(text.encode('utf-8')):
                    resume_at = None
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #4: Try to hyphenate
    hyphens = style['hyphens']
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = '\u00ad'

    try_hyphenate = False
    if hyphens != 'none':
        next_word_boundaries = get_next_word_boundaries(second_line_text, lang)
        if next_word_boundaries:
            # We have a word to hyphenate
            start_word, stop_word = next_word_boundaries
            next_word = second_line_text[start_word:stop_word]
            if stop_word - start_word >= total:
                # This word is long enough
                first_line_width, _ = get_size(first_line, style)
                space = max_width - first_line_width
                if style['hyphenate_limit_zone'].unit == '%':
                    limit_zone = (max_width *
                                  style['hyphenate_limit_zone'].value / 100.)
                else:
                    limit_zone = style['hyphenate_limit_zone'].value
                if space > limit_zone or space < 0:
                    # Available space is worth the try, or the line is even too
                    # long to fit: try to hyphenate
                    try_hyphenate = True

    if try_hyphenate:
        # Automatic hyphenation possible and next word is long enough
        auto_hyphenation = hyphens == 'auto' and lang
        manual_hyphenation = False
        if auto_hyphenation:
            if soft_hyphen in first_line_text or soft_hyphen in next_word:
                # Automatic hyphenation opportunities within a word must be
                # ignored if the word contains a conditional hyphen, in favor
                # of the conditional hyphen(s).
                # See https://drafts.csswg.org/css-text-3/#valdef-hyphens-auto
                manual_hyphenation = True
        else:
            manual_hyphenation = hyphens == 'manual'

        if manual_hyphenation:
            # Manual hyphenation: check that the line ends with a soft
            # hyphen and add the missing hyphen
            if first_line_text.endswith(soft_hyphen):
                # The first line has been split on a soft hyphen
                if ' ' in first_line_text:
                    first_line_text, next_word = (first_line_text.rsplit(
                        ' ', 1))
                    next_word = ' ' + next_word
                    layout.set_text(first_line_text)
                    first_line, index = layout.get_first_line()
                    resume_at = len((first_line_text + ' ').encode('utf8'))
                else:
                    first_line_text, next_word = '', first_line_text
            soft_hyphen_indexes = [
                match.start() for match in re.finditer(soft_hyphen, next_word)
            ]
            soft_hyphen_indexes.reverse()
            dictionary_iterations = [
                next_word[:i + 1] for i in soft_hyphen_indexes
            ]
        elif auto_hyphenation:
            dictionary_key = (lang, left, right, total)
            dictionary = context.dictionaries.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                context.dictionaries[dictionary_key] = dictionary
            dictionary_iterations = [
                start for start, end in dictionary.iterate(next_word)
            ]
        else:
            dictionary_iterations = []

        if dictionary_iterations:
            for first_word_part in dictionary_iterations:
                new_first_line_text = (first_line_text +
                                       second_line_text[:start_word] +
                                       first_word_part)
                hyphenated_first_line_text = (new_first_line_text +
                                              style['hyphenate_character'])
                new_layout = create_layout(hyphenated_first_line_text, style,
                                           context, max_width,
                                           justification_spacing)
                new_first_line, new_index = new_layout.get_first_line()
                new_first_line_width, _ = get_size(new_first_line, style)
                new_space = max_width - new_first_line_width
                if new_index is None and (new_space >= 0 or first_word_part
                                          == dictionary_iterations[-1]):
                    hyphenated = True
                    layout = new_layout
                    first_line = new_first_line
                    index = new_index
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(new_first_line_text)] == soft_hyphen:
                        # Recreate the layout with no max_width to be sure that
                        # we don't break before the soft hyphen
                        pango.pango_layout_set_width(layout.layout,
                                                     units_from_double(-1))
                        resume_at += len(soft_hyphen.encode('utf8'))
                    break

            if not hyphenated and not first_line_text:
                # Recreate the layout with no max_width to be sure that
                # we don't break before or inside the hyphenate character
                hyphenated = True
                layout.set_text(hyphenated_first_line_text)
                pango.pango_layout_set_width(layout.layout,
                                             units_from_double(-1))
                first_line, index = layout.get_first_line()
                resume_at = len(new_first_line_text.encode('utf8'))
                if text[len(first_line_text)] == soft_hyphen:
                    resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (first_line_text +
                                      style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(layout.layout, units_from_double(-1))
        first_line, index = layout.get_first_line()
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(layout.layout,
                                     units_from_double(max_width))
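        # WRAP_CHAR lets Pango break the line at any character,
        # not only at word boundaries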
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        first_line, index = layout.get_first_line()
        resume_at = index or first_line.length
        if resume_at >= len(text.encode('utf-8')):
            resume_at = None

    return first_line_metrics(first_line, text, layout, resume_at,
                              space_collapse, style, hyphenated,
                              style['hyphenate_character'])
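A minimal sketch of the pyphen calls that Step #4 above relies on (the word and the left/right limits are invented for illustration):

import pyphen

lang = pyphen.language_fallback('en')  # resolve to an available dictionary code
dictionary = pyphen.Pyphen(lang=lang, left=2, right=3)

# iterate() yields (first_part, second_part) pairs for every allowed
# hyphenation point, longest first_part first, which is why the loop in
# Step #4 can stop at the first candidate that still fits in max_width.
for first_part, second_part in dictionary.iterate('hyphenation'):
    print(first_part + '-' + second_part)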
Example #30
0
        'comment': py_comment,
        'capture': py_capture_stdout,
    },
    'js': {
        'comment': js_comment,
        'capture': (lambda text, capture: text),
    }
}

# TODO (mbarkhau 2016-08-21): Warn about line length in
#       code blocks, because they cause horizontal
#       scrolling.
# TODO (mbarkhau 2016-08-21): Parse lang from file
#       level metadata

HYPHEN_DICT = pyphen.Pyphen(lang='en_US')


def open(filepath, mode='r', encoding='utf-8'):
    return io.open(filepath, mode=mode, encoding=encoding)


META_PARAM_RE = re.compile(
    r"""
    (?P<key>[\w\-\.]+)
    \:
    (?P<val>[^\}\,]+)
    (?:\}|\,)
""", re.VERBOSE | re.MULTILINE)