from urduhack import normalize
# URDU_ALL_CHARACTERS is defined in urduhack's urdu_characters module
# (adjust the import path if the package layout differs).
from urduhack.urdu_characters import URDU_ALL_CHARACTERS


def test_normalize():
    """Test that normalize() maps text onto the normalized Urdu character set."""
    # The input deliberately contains Arabic presentation forms (e.g. ﻤﯿﮟ, ﮨﮯ)
    # that normalize() should rewrite. Translation: "There is no shortage of
    # resources in Pakistan."
    text = "پاکستان ﻤﯿﮟ وسائل کی کوئی کمی نہیں ﮨﮯ۔"
    normalized = normalize(text)
    assert isinstance(normalized, str)
    for char in normalized:
        if char == " ":
            continue
        assert char in URDU_ALL_CHARACTERS
import re

from urduhack import normalize
from urduhack.stop_words import STOP_WORDS


def getTokensList(text):
    """Normalize the text, split it on non-word characters, and drop stop words."""
    words = re.split(r'\W+', normalize(text))
    return [word for word in words if word not in STOP_WORDS]
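# Hypothetical call site (not in the original source), showing the expected
# input and output shape: a raw Urdu string in, a list of content tokens out.
if __name__ == "__main__":
    tokens = getTokensList("پاکستان میں وسائل کی کوئی کمی نہیں ہے۔")
    print(tokens)  # normalized tokens with Urdu stop words removed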
from urduhack import normalize

# The docstring names words.txt; assumed to sit next to this script.
file_name = "words.txt"


def sorted_words():
    """Normalize, deduplicate, and sort the words.txt file in place."""
    words_set = set()
    with open(file_name, encoding="utf8") as handler:
        for word in handler:
            word = normalize(word.strip())
            word = '_'.join(word.split())  # join multi-word entries with underscores
            words_set.add(word.strip())
    print(f"Total words: {len(words_set)}")
    with open(file_name, 'w', encoding="utf8") as the_file:
        for word in sorted(words_set):
            the_file.write(word + '\n')
"""TF-IDF based similarity""" from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import linear_kernel from urduhack.preprocessing import remove_punctuation from urduhack.tokenization.words import fix_join_words from urduhack import normalize import pandas as pd documents = [] df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv") for index, row in df.iterrows(): aya_no = row["Unnamed: 0"] if isinstance(row['translation'], str): translation = remove_punctuation(fix_join_words(normalize(row["translation"]))) documents.append(translation) q1 = "کیا سود حرام ہے" q2 = "عورت کے لئے فرائض" query = normalize("یہود نصاری") vectorizer = TfidfVectorizer() vectors = vectorizer.fit_transform([query] + documents) cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten() document_scores = [item.item() for item in cosine_similarities[1:]] print(sorted(zip(document_scores, documents), reverse=True)[:10])
import spacy
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from urduhack import normalization, normalize, stop_words

# d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"

configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}
reshaper = ArabicReshaper(configuration=configuration)

text_to_be_reshaped = "ترجمان"  # "interpreter"
text_to_be_reshaped = normalize(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)

nlp = spacy.blank("ur")
reshaped_text = reshaper.reshape(text_to_be_reshaped)
doc = nlp(text_to_be_reshaped)

# Keep only tokens that are not in the Urdu stop-word set.
text = []
for each in doc:
    if str(each) not in stop_words.STOP_WORDS:
        text.append(str(each))

reshaped_text = ""
for each in text:
    reshaped_text = reshaped_text + " " + each
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec
from gensim.similarities import (MatrixSimilarity, SparseTermSimilarityMatrix,
                                 WordEmbeddingSimilarityIndex)
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words
import pandas as pd
import numpy as np

documents = []
df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/Islam-360/ahsanulbayan.db.csv")
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = remove_punctuation(fix_join_words(normalize(row["translation"])))
        documents.append(translation.split())

q1 = "کیا سود حرام ہے"  # "Is interest (usury) forbidden?"
q2 = "بنی اسرائیل کے لیے دعوت"  # "An invitation to the Children of Israel"
q3 = "منافقانہ کردار کیا ہے"  # "What is hypocritical conduct?"
q4 = "نماز کا حکم"  # "The command regarding prayer"
q5 = "میاں بیوی تعلقات"  # "Relations between husband and wife"
search_terms = normalize(q5).split()

model_path = "/Users/muhammadfahid/PycharmProjects/Islam-360/rel_model-0.9.w2v"
model = Word2Vec.load(model_path)
similarity_index = WordEmbeddingSimilarityIndex(model.wv)

dictionary = Dictionary(documents + [search_terms])
tf_idf = TfidfModel(dictionary=dictionary)
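# The script stops after building the TF-IDF model. A minimal sketch of the
# remaining soft-cosine retrieval step, assuming gensim's SoftCosineSimilarity
# index; this continuation is not part of the original source:
from gensim.similarities import SoftCosineSimilarity

similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tf_idf)
bow_corpus = [dictionary.doc2bow(document) for document in documents]
index = SoftCosineSimilarity(tf_idf[bow_corpus], similarity_matrix)
sims = index[tf_idf[dictionary.doc2bow(search_terms)]]
for doc_id, score in sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:10]:
    print(score, " ".join(documents[doc_id]))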
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words
import pandas as pd

documents = []
df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/Islam-360/ahsanulbayan.db.csv")
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = remove_punctuation(fix_join_words(normalize(row["translation"])))
        documents.append(translation.split())

model_path = "/Users/muhammadfahid/PycharmProjects/Islam-360/rel_model-0.9.w2v"

dictionary = Dictionary(documents)
corpus = [dictionary.doc2bow(document) for document in documents]
lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=50)

q1 = "کیا سود حرام ہے"  # "Is interest (usury) forbidden?"
q2 = "بنی اسرائیل کے لیے دعوت"  # "An invitation to the Children of Israel"
q3 = "منافقانہ کردار کیا ہے"  # "What is hypocritical conduct?"

# Normalize the query the same way the corpus was normalized.
vec_bow = dictionary.doc2bow(normalize(q2).split())
vec_lsi = lsi[vec_bow]
similarity_indexes = MatrixSimilarity(lsi[corpus])
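# The original stops after building the similarity index. A minimal sketch of
# ranking the documents against the LSI query vector (not in the source):
sims = similarity_indexes[vec_lsi]
for doc_id, score in sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:10]:
    print(score, " ".join(documents[doc_id]))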
# KeywordProcessor is assumed to come from the flashtext package.
from flashtext import KeywordProcessor
from sklearn.feature_extraction.text import TfidfVectorizer
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.stop_words import STOP_WORDS
from urduhack.tokenization.words import fix_join_words
import pandas as pd

sentences = []
documents = []
df = pd.read_csv(
    "/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv"
)
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = replace_numbers(
            remove_punctuation(fix_join_words(normalize(row["translation"]))))
        documents.append(" ".join(
            [word for word in translation.split() if word not in STOP_WORDS]))
        sentences.append(translation)

keyword_processor = KeywordProcessor()
vectorizer = TfidfVectorizer()
results_kw = []
results_tfidf = []

query = normalize("کیا سود حرام ہے")  # "Is interest (usury) forbidden?"
k_words = set()
for word in query.split():
    if word not in STOP_WORDS:
        k_words.add(word)
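# The script ends after collecting the query keywords. A minimal sketch of the
# two retrieval passes the result lists suggest (flashtext keyword matching and
# TF-IDF cosine scoring); this continuation is not in the original source:
from sklearn.metrics.pairwise import linear_kernel

keyword_processor.add_keywords_from_list(list(k_words))
for sentence in sentences:
    if keyword_processor.extract_keywords(sentence):
        results_kw.append(sentence)

vectors = vectorizer.fit_transform([" ".join(k_words)] + documents)
scores = linear_kernel(vectors[0:1], vectors).flatten()[1:]
results_tfidf = sorted(zip(scores, sentences), reverse=True)[:10]
print(len(results_kw), "keyword hits")
print(results_tfidf)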
import os
from os import path

import matplotlib.pyplot as plt
import spacy
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from urduhack import normalization, normalize, stop_words
from wordcloud import WordCloud


def MyWordCloudGen(imgpath, scriptpath):
    # d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"
    configuration = {
        'delete_harakat': False,
        'support_ligatures': True,
        'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
    }
    reshaper = ArabicReshaper(configuration=configuration)

    # Sort the scripts by modification time and read the second-oldest one.
    scripts = os.listdir(scriptpath)
    scripts.sort(key=lambda x: os.stat(os.path.join(scriptpath, x)).st_mtime)
    print(scripts)
    with open(path.join(scriptpath, scripts[1]), encoding="UTF-8") as handle:
        text_to_be_reshaped = handle.read()
    print(text_to_be_reshaped)

    text_to_be_reshaped = normalize(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
    text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)

    nlp = spacy.blank("ur")
    reshaped_text = reshaper.reshape(text_to_be_reshaped)
    doc = nlp(text_to_be_reshaped)

    # Keep only tokens that are not in the Urdu stop-word set.
    text = []
    for each in doc:
        if str(each) not in stop_words.STOP_WORDS:
            text.append(str(each))

    reshaped_text = ""
    for each in text:
        reshaped_text = reshaped_text + " " + each
    reshaped_text = reshaper.reshape(reshaped_text)
    bidi_text = get_display(reshaped_text)

    # fontdir = "D:\\tarjumaan-master\\Urdu_fonts\\"
    plt.figure(figsize=(20, 15), dpi=200)
    wordcloud = WordCloud(os.getcwd() + "\\Urdu_fonts\\" + "DecoType Thuluth.ttf",
                          width=2000,
                          height=1500,
                          include_numbers=True,
                          stopwords=stop_words.STOP_WORDS,
                          min_font_size=30,
                          background_color="black",
                          margin=0,
                          max_words=200).generate(bidi_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(imgpath + "\\image.png", format="png")
    plt.show()

    img = imgpath + "\\" + "image.png"
    print(img)
    print("Relative Path", os.path.relpath(img))
    img = os.path.relpath(img)
    return img
import re
from pathlib import Path

import pandas as pd
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization.words import fix_join_words

# Build the tokenized document list from every translation CSV.
DOCUMENTS = []
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs').glob('*.csv')
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = row['translation'].strip()
            translation = replace_numbers(
                remove_punctuation(fix_join_words(normalize(translation))))
            translation = re.sub(" +", " ", translation)
            DOCUMENTS.append(translation.split())
import re
from pathlib import Path

import pandas as pd
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization import sentence_tokenizer
from urduhack.tokenization.words import fix_join_words

# Export normalized, sentence-tokenized translation and tafseer text,
# one sentence per line, for embedding training.
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs').glob('*.csv')
all_sentences = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt"
file_open = open(all_sentences, "w", encoding="utf8")
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = normalize(row['translation']).strip()
            translation = fix_join_words(translation)
            for trans in sentence_tokenizer(translation):
                trans = remove_punctuation(trans)
                trans = re.sub(" +", " ", trans)
                trans = replace_numbers(trans)
                file_open.write(trans + "\n")
        if isinstance(row['tafseer'], str):
            sents = sentence_tokenizer(fix_join_words(normalize(row['tafseer']).strip()))
            for sent in sents:
                sent = remove_punctuation(sent)
                sent = re.sub(" +", " ", sent)
                file_open.write(sent + "\n")
file_open.close()
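# The exported sentence file feeds the Word2Vec embedding loaded by the
# similarity scripts above. A minimal training sketch, assuming gensim's
# LineSentence reader; the hyperparameters and output path are illustrative,
# not the ones used to produce rel_model-0.9.w2v:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences_iter = LineSentence(all_sentences)  # streams one tokenized line at a time
w2v = Word2Vec(sentences=sentences_iter, vector_size=100, window=5, min_count=2, workers=4)
w2v.save("/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation.w2v")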