Exemplo n.º 1
0
def text_to_bidi(text):
    text = normalize_spaces(text)
    configured_reshaper = ArabicReshaper(
        configuration={'use_unshaped_instead_of_isolated': True})
    reshaped_text = configured_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    return bidi_text
Exemplo n.º 2
0
    def _draw_textarea(self, canvas: Canvas, op: OrderPosition, order: Order,
                       o: dict):
        font = o['fontfamily']
        if o['bold']:
            font += ' B'
        if o['italic']:
            font += ' I'

        align_map = {'left': TA_LEFT, 'center': TA_CENTER, 'right': TA_RIGHT}
        style = ParagraphStyle(name=uuid.uuid4().hex,
                               fontName=font,
                               fontSize=float(o['fontsize']),
                               leading=float(o['fontsize']),
                               autoLeading="max",
                               textColor=Color(o['color'][0] / 255,
                                               o['color'][1] / 255,
                                               o['color'][2] / 255),
                               alignment=align_map[o['align']])
        text = conditional_escape(
            self._get_text_content(op, order, o)
            or "", ).replace("\n", "<br/>\n")

        # reportlab does not support RTL, ligature-heavy scripts like Arabic. Therefore, we use ArabicReshaper
        # to resolve all ligatures and python-bidi to switch RTL texts.
        configuration = {
            'delete_harakat': True,
            'support_ligatures': False,
        }
        reshaper = ArabicReshaper(configuration=configuration)
        try:
            text = "<br/>".join(
                get_display(reshaper.reshape(l)) for l in text.split("<br/>"))
        except:
            logger.exception('Reshaping/Bidi fixes failed on string {}'.format(
                repr(text)))

        p = Paragraph(text, style=style)
        w, h = p.wrapOn(canvas, float(o['width']) * mm, 1000 * mm)
        # p_size = p.wrap(float(o['width']) * mm, 1000 * mm)
        ad = getAscentDescent(font, float(o['fontsize']))
        canvas.saveState()
        # The ascent/descent offsets here are not really proven to be correct, they're just empirical values to get
        # reportlab render similarly to browser canvas.
        if o.get('downward', False):
            canvas.translate(float(o['left']) * mm, float(o['bottom']) * mm)
            canvas.rotate(o.get('rotation', 0) * -1)
            p.drawOn(canvas, 0, -h - ad[1] / 2)
        else:
            canvas.translate(
                float(o['left']) * mm,
                float(o['bottom']) * mm + h)
            canvas.rotate(o.get('rotation', 0) * -1)
            p.drawOn(canvas, 0, -h - ad[1])
        canvas.restoreState()
Exemplo n.º 3
0
    def _draw_textarea(self, canvas: Canvas, op: OrderPosition, order: Order,
                       o: dict):
        font = o['fontfamily']
        if o['bold']:
            font += ' B'
        if o['italic']:
            font += ' I'

        align_map = {'left': TA_LEFT, 'center': TA_CENTER, 'right': TA_RIGHT}
        style = ParagraphStyle(name=uuid.uuid4().hex,
                               fontName=font,
                               fontSize=float(o['fontsize']),
                               leading=float(o['fontsize']),
                               autoLeading="max",
                               textColor=Color(o['color'][0] / 255,
                                               o['color'][1] / 255,
                                               o['color'][2] / 255),
                               alignment=align_map[o['align']])
        text = re.sub(
            "<br[^>]*>", "<br/>",
            bleach.clean(self._get_text_content(op, order, o) or "",
                         tags=["br"],
                         attributes={},
                         styles=[],
                         strip=True))

        # reportlab does not support RTL, ligature-heavy scripts like Arabic. Therefore, we use ArabicReshaper
        # to resolve all ligatures and python-bidi to switch RTL texts.
        configuration = {
            'delete_harakat': True,
            'support_ligatures': False,
        }
        reshaper = ArabicReshaper(configuration=configuration)
        text = "<br/>".join(
            get_display(reshaper.reshape(l)) for l in text.split("<br/>"))

        p = Paragraph(text, style=style)
        p.wrapOn(canvas, float(o['width']) * mm, 1000 * mm)
        # p_size = p.wrap(float(o['width']) * mm, 1000 * mm)
        ad = getAscentDescent(font, float(o['fontsize']))
        p.drawOn(canvas,
                 float(o['left']) * mm,
                 float(o['bottom']) * mm - ad[1])
Exemplo n.º 4
0
def main():
    """
        Description: Main function
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    if args.dict:
        lang_dict = []
        if os.path.isfile(args.dict):
            with open(args.dict, "r", encoding="utf8", errors="ignore") as d:
                lang_dict = [l for l in d.read().splitlines() if len(l) > 0]
        else:
            sys.exit("Cannot open dict")
    else:
        lang_dict = load_dict(args.language)
    with open('./trdg/dicts/en.txt', 'r', encoding='utf8',
              errors='ignore') as f:
        en_dict = [i for i in f.read().splitlines() if len(i) > 0]
    # Create font (path) list
    if args.font_dir:
        fonts = [
            os.path.join(args.font_dir, p) for p in os.listdir(args.font_dir)
            if os.path.splitext(p)[1] == ".ttf"
        ]
    elif args.font:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")
    else:
        fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        print('use_wikipedia')
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != "":
        print('input_file')
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        print('random_sequences')
        strings = create_strings_randomly(
            args.length,
            args.random,
            args.count,
            args.include_letters,
            args.include_numbers,
            args.include_symbols,
            args.language,
        )
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (
                args.include_letters,
                args.include_numbers,
                args.include_symbols,
        ):
            args.name_format = 2


#     else :
#         print('create_strings_from_dict')
#         strings = create_strings_from_dict(
#             args.length,
#             args.random,
#             args.count,
#             lang_dict
#             )
    else:
        print(make_my_strings)
        strings = make_my_strings(
            args.count,
            lang_dict,
            en_dict,
        )

    if args.language == "ar":
        from arabic_reshaper import ArabicReshaper

        arabic_reshaper = ArabicReshaper()
        strings = [
            " ".join([arabic_reshaper.reshape(w) for w in s.split(" ")[::-1]])
            for s in strings
        ]
    if args.case == "upper":
        strings = [x.upper() for x in strings]
    if args.case == "lower":
        strings = [x.lower() for x in strings]

    string_count = len(strings)

    p = Pool(args.thread_count)
    for _ in tqdm(
            p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple,
                zip(
                    [i for i in range(0, string_count)],
                    strings,
                    [
                        fonts[rnd.randrange(0, len(fonts))]
                        for _ in range(0, string_count)
                    ],
                    [args.output_dir] * string_count,
                    [args.format] * string_count,
                    [args.extension] * string_count,
                    [args.skew_angle] * string_count,
                    [args.random_skew] * string_count,
                    [args.blur] * string_count,
                    [args.random_blur] * string_count,
                    [args.background] * string_count,
                    [args.distorsion] * string_count,
                    [args.distorsion_orientation] * string_count,
                    [args.handwritten] * string_count,
                    [args.name_format] * string_count,
                    [args.width] * string_count,
                    [args.alignment] * string_count,
                    [args.text_color] * string_count,
                    [args.orientation] * string_count,
                    [args.space_width] * string_count,
                    [args.character_spacing] * string_count,
                    [args.margins] * string_count,
                    [args.fit] * string_count,
                    [args.output_mask] * string_count,
                    [args.word_split] * string_count,
                    [args.image_dir] * string_count,
                ),
            ),
            total=args.count,
    ):
        pass
    p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"),
                  "w",
                  encoding="utf8") as f:
            for i in range(string_count):
                file_name = str(i) + "." + args.extension
                f.write("{}\t{}\n".format(file_name, strings[i]))
Exemplo n.º 5
0
d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"

configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}

reshaper = ArabicReshaper(configuration=configuration)
text_to_be_reshaped = "ترجمان"
text_to_be_reshaped = normalize(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)
nlp = spacy.blank("ur")
reshaped_text = reshaper.reshape(text_to_be_reshaped)
doc = nlp(text_to_be_reshaped)
text = []

for each in doc:
    if str(each) not in str(stop_words.STOP_WORDS):
        #(each)
        text.append(str(each))
reshaped_text = ""

for each in text:
        reshaped_text = reshaped_text+" "+each
    
reshaped_text = reshaper.reshape(reshaped_text)

from bidi.algorithm import get_display
Exemplo n.º 6
0
import arabic_reshaper

text_to_be_reshaped = 'اللغة العربية رائعة'
reshaped_text = arabic_reshaper.reshape(text_to_be_reshaped)
print(reshaped_text)

from arabic_reshaper import ArabicReshaper
configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}
reshaper = ArabicReshaper(configuration=configuration)
l1='ل'
l2='ا'
l3='ر'
l4='ي'
text_to_be_reshaped = 'ب ﺭ ﻱ ﺕ' # had to split the string for display
reshaped_text = reshaper.reshape(text_to_be_reshaped.replace(' ',''))
print(reshaped_text)
Exemplo n.º 7
0
def MyWordCloudGen(imgpath, scriptpath, os):

    # d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"
    configuration = {
        'delete_harakat': False,
        'support_ligatures': True,
        'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
    }
    reshaper = ArabicReshaper(configuration=configuration)
    scripts = os.listdir(scriptpath)
    scripts.sort(key=lambda x: os.stat(os.path.join(scriptpath, x)).st_mtime)
    print((scripts))

    text_to_be_reshaped = open(path.join(scriptpath, scripts[1]),
                               encoding="UTF-8").read()
    print(text_to_be_reshaped)
    text_to_be_reshaped = normalize(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_characters(
        text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_combine_characters(
        text_to_be_reshaped)
    text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)
    nlp = spacy.blank("ur")
    reshaped_text = reshaper.reshape(text_to_be_reshaped)
    doc = nlp(text_to_be_reshaped)
    text = []

    for each in doc:
        if str(each) not in str(stop_words.STOP_WORDS):
            #(each)
            text.append(str(each))
    reshaped_text = ""

    for each in text:
        reshaped_text = reshaped_text + " " + each

    reshaped_text = reshaper.reshape(reshaped_text)

    from bidi.algorithm import get_display
    bidi_text = get_display(reshaped_text)
    fontdir = "D:\\tarjumaan-master\\Urdu_fonts\\"
    import os
    plt.figure(figsize=(20, 15), dpi=200)
    wordcloud = WordCloud(os.getcwd() + "\\Urdu_fonts\\" +
                          "DecoType Thuluth.ttf",
                          width=2000,
                          height=1500,
                          include_numbers=True,
                          stopwords=stop_words.STOP_WORDS,
                          min_font_size=30,
                          background_color="black",
                          margin=0,
                          max_words=200).generate(bidi_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(imgpath + "\\image.png", format="png")
    plt.show()

    img = imgpath + "\\" + "image.png"
    print(img)
    print("Relative Path", os.path.relpath(img))
    img = os.path.relpath(img)
    return img