from urduhack import normalize
# URDU_ALL_CHARACTERS is defined in urduhack's urdu_characters module
# (adjust the import path if the package layout differs).
from urduhack.urdu_characters import URDU_ALL_CHARACTERS


def test_normalize():
    """Test that normalize() maps text onto the normalized Urdu character set."""
    # The input deliberately contains Arabic presentation forms (e.g. ﻤﯿﮟ, ﮨﮯ)
    # that normalize() should rewrite. Translation: "There is no shortage of
    # resources in Pakistan."
    text = "پاکستان ﻤﯿﮟ وسائل کی کوئی کمی نہیں ﮨﮯ۔"
    normalized = normalize(text)
    assert isinstance(normalized, str)
    for char in normalized:
        if char == " ":
            continue
        assert char in URDU_ALL_CHARACTERS
import re

from urduhack import normalize
from urduhack.stop_words import STOP_WORDS


def getTokensList(text):
    """Normalize the text, split it on non-word characters, and drop stop words."""
    words = re.split(r'\W+', normalize(text))
    return [word for word in words if word not in STOP_WORDS]
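# Hypothetical call site (not in the original source), showing the expected
# input and output shape: a raw Urdu string in, a list of content tokens out.
if __name__ == "__main__":
    tokens = getTokensList("پاکستان میں وسائل کی کوئی کمی نہیں ہے۔")
    print(tokens)  # normalized tokens with Urdu stop words removed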
from urduhack import normalize

# The docstring names words.txt; assumed to sit next to this script.
file_name = "words.txt"


def sorted_words():
    """Normalize, deduplicate, and sort the words.txt file in place."""
    words_set = set()
    with open(file_name, encoding="utf8") as handler:
        for word in handler:
            word = normalize(word.strip())
            word = '_'.join(word.split())  # join multi-word entries with underscores
            words_set.add(word.strip())
    print(f"Total words: {len(words_set)}")
    with open(file_name, 'w', encoding="utf8") as the_file:
        for word in sorted(words_set):
            the_file.write(word + '\n')
"""TF-IDF based similarity""" from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import linear_kernel from urduhack.preprocessing import remove_punctuation from urduhack.tokenization.words import fix_join_words from urduhack import normalize import pandas as pd documents = [] df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv") for index, row in df.iterrows(): aya_no = row["Unnamed: 0"] if isinstance(row['translation'], str): translation = remove_punctuation(fix_join_words(normalize(row["translation"]))) documents.append(translation) q1 = "کیا سود حرام ہے" q2 = "عورت کے لئے فرائض" query = normalize("یہود نصاری") vectorizer = TfidfVectorizer() vectors = vectorizer.fit_transform([query] + documents) cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten() document_scores = [item.item() for item in cosine_similarities[1:]] print(sorted(zip(document_scores, documents), reverse=True)[:10])
import spacy
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from urduhack import normalization, normalize, stop_words

# d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"

configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}
reshaper = ArabicReshaper(configuration=configuration)

text_to_be_reshaped = "ترجمان"  # "interpreter"
text_to_be_reshaped = normalize(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)

nlp = spacy.blank("ur")
reshaped_text = reshaper.reshape(text_to_be_reshaped)
doc = nlp(text_to_be_reshaped)

# Keep only tokens that are not in the Urdu stop-word set.
text = []
for each in doc:
    if str(each) not in stop_words.STOP_WORDS:
        text.append(str(each))

reshaped_text = ""
for each in text:
    reshaped_text = reshaped_text + " " + each
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec
from gensim.similarities import (MatrixSimilarity, SparseTermSimilarityMatrix,
                                 WordEmbeddingSimilarityIndex)
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words
import pandas as pd
import numpy as np

documents = []
df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/Islam-360/ahsanulbayan.db.csv")
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = remove_punctuation(fix_join_words(normalize(row["translation"])))
        documents.append(translation.split())

q1 = "کیا سود حرام ہے"  # "Is interest (usury) forbidden?"
q2 = "بنی اسرائیل کے لیے دعوت"  # "An invitation to the Children of Israel"
q3 = "منافقانہ کردار کیا ہے"  # "What is hypocritical conduct?"
q4 = "نماز کا حکم"  # "The command regarding prayer"
q5 = "میاں بیوی تعلقات"  # "Relations between husband and wife"
search_terms = normalize(q5).split()

model_path = "/Users/muhammadfahid/PycharmProjects/Islam-360/rel_model-0.9.w2v"
model = Word2Vec.load(model_path)
similarity_index = WordEmbeddingSimilarityIndex(model.wv)

dictionary = Dictionary(documents + [search_terms])
tf_idf = TfidfModel(dictionary=dictionary)
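# The script stops after building the TF-IDF model. A minimal sketch of the
# remaining soft-cosine retrieval step, assuming gensim's SoftCosineSimilarity
# index; this continuation is not part of the original source:
from gensim.similarities import SoftCosineSimilarity

similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tf_idf)
bow_corpus = [dictionary.doc2bow(document) for document in documents]
index = SoftCosineSimilarity(tf_idf[bow_corpus], similarity_matrix)
sims = index[tf_idf[dictionary.doc2bow(search_terms)]]
for doc_id, score in sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:10]:
    print(score, " ".join(documents[doc_id]))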
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words
import pandas as pd

documents = []
df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/Islam-360/ahsanulbayan.db.csv")
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = remove_punctuation(fix_join_words(normalize(row["translation"])))
        documents.append(translation.split())

model_path = "/Users/muhammadfahid/PycharmProjects/Islam-360/rel_model-0.9.w2v"

dictionary = Dictionary(documents)
corpus = [dictionary.doc2bow(document) for document in documents]
lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=50)

q1 = "کیا سود حرام ہے"  # "Is interest (usury) forbidden?"
q2 = "بنی اسرائیل کے لیے دعوت"  # "An invitation to the Children of Israel"
q3 = "منافقانہ کردار کیا ہے"  # "What is hypocritical conduct?"

# Normalize the query the same way the corpus was normalized.
vec_bow = dictionary.doc2bow(normalize(q2).split())
vec_lsi = lsi[vec_bow]
similarity_indexes = MatrixSimilarity(lsi[corpus])
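# The original stops after building the similarity index. A minimal sketch of
# ranking the documents against the LSI query vector (not in the source):
sims = similarity_indexes[vec_lsi]
for doc_id, score in sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:10]:
    print(score, " ".join(documents[doc_id]))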
# KeywordProcessor is assumed to come from the flashtext package.
from flashtext import KeywordProcessor
from sklearn.feature_extraction.text import TfidfVectorizer
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.stop_words import STOP_WORDS
from urduhack.tokenization.words import fix_join_words
import pandas as pd

sentences = []
documents = []
df = pd.read_csv(
    "/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv"
)
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]
    if isinstance(row['translation'], str):
        translation = replace_numbers(
            remove_punctuation(fix_join_words(normalize(row["translation"]))))
        documents.append(" ".join(
            [word for word in translation.split() if word not in STOP_WORDS]))
        sentences.append(translation)

keyword_processor = KeywordProcessor()
vectorizer = TfidfVectorizer()
results_kw = []
results_tfidf = []

query = normalize("کیا سود حرام ہے")  # "Is interest (usury) forbidden?"
k_words = set()
for word in query.split():
    if word not in STOP_WORDS:
        k_words.add(word)
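# The script ends after collecting the query keywords. A minimal sketch of the
# two retrieval passes the result lists suggest (flashtext keyword matching and
# TF-IDF cosine scoring); this continuation is not in the original source:
from sklearn.metrics.pairwise import linear_kernel

keyword_processor.add_keywords_from_list(list(k_words))
for sentence in sentences:
    if keyword_processor.extract_keywords(sentence):
        results_kw.append(sentence)

vectors = vectorizer.fit_transform([" ".join(k_words)] + documents)
scores = linear_kernel(vectors[0:1], vectors).flatten()[1:]
results_tfidf = sorted(zip(scores, sentences), reverse=True)[:10]
print(len(results_kw), "keyword hits")
print(results_tfidf)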
import os
from os import path

import matplotlib.pyplot as plt
import spacy
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from urduhack import normalization, normalize, stop_words
from wordcloud import WordCloud


def MyWordCloudGen(imgpath, scriptpath):
    # d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"
    configuration = {
        'delete_harakat': False,
        'support_ligatures': True,
        'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
    }
    reshaper = ArabicReshaper(configuration=configuration)

    # Sort the scripts by modification time and read the second-oldest one.
    scripts = os.listdir(scriptpath)
    scripts.sort(key=lambda x: os.stat(os.path.join(scriptpath, x)).st_mtime)
    print(scripts)
    with open(path.join(scriptpath, scripts[1]), encoding="UTF-8") as handle:
        text_to_be_reshaped = handle.read()
    print(text_to_be_reshaped)

    text_to_be_reshaped = normalize(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
    text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)

    nlp = spacy.blank("ur")
    reshaped_text = reshaper.reshape(text_to_be_reshaped)
    doc = nlp(text_to_be_reshaped)

    # Keep only tokens that are not in the Urdu stop-word set.
    text = []
    for each in doc:
        if str(each) not in stop_words.STOP_WORDS:
            text.append(str(each))

    reshaped_text = ""
    for each in text:
        reshaped_text = reshaped_text + " " + each
    reshaped_text = reshaper.reshape(reshaped_text)
    bidi_text = get_display(reshaped_text)

    # fontdir = "D:\\tarjumaan-master\\Urdu_fonts\\"
    plt.figure(figsize=(20, 15), dpi=200)
    wordcloud = WordCloud(os.getcwd() + "\\Urdu_fonts\\" + "DecoType Thuluth.ttf",
                          width=2000,
                          height=1500,
                          include_numbers=True,
                          stopwords=stop_words.STOP_WORDS,
                          min_font_size=30,
                          background_color="black",
                          margin=0,
                          max_words=200).generate(bidi_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(imgpath + "\\image.png", format="png")
    plt.show()

    img = imgpath + "\\" + "image.png"
    print(img)
    print("Relative Path", os.path.relpath(img))
    img = os.path.relpath(img)
    return img
import re
from pathlib import Path

import pandas as pd
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization.words import fix_join_words

# Build the tokenized document list from every translation CSV.
DOCUMENTS = []
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs').glob('*.csv')
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = row['translation'].strip()
            translation = replace_numbers(
                remove_punctuation(fix_join_words(normalize(translation))))
            translation = re.sub(" +", " ", translation)
            DOCUMENTS.append(translation.split())
import re
from pathlib import Path

import pandas as pd
from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization import sentence_tokenizer
from urduhack.tokenization.words import fix_join_words

# Export normalized, sentence-tokenized translation and tafseer text,
# one sentence per line, for embedding training.
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs').glob('*.csv')
all_sentences = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt"
file_open = open(all_sentences, "w", encoding="utf8")
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = normalize(row['translation']).strip()
            translation = fix_join_words(translation)
            for trans in sentence_tokenizer(translation):
                trans = remove_punctuation(trans)
                trans = re.sub(" +", " ", trans)
                trans = replace_numbers(trans)
                file_open.write(trans + "\n")
        if isinstance(row['tafseer'], str):
            sents = sentence_tokenizer(fix_join_words(normalize(row['tafseer']).strip()))
            for sent in sents:
                sent = remove_punctuation(sent)
                sent = re.sub(" +", " ", sent)
                file_open.write(sent + "\n")
file_open.close()
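# The exported sentence file feeds the Word2Vec embedding loaded by the
# similarity scripts above. A minimal training sketch, assuming gensim's
# LineSentence reader; the hyperparameters and output path are illustrative,
# not the ones used to produce rel_model-0.9.w2v:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences_iter = LineSentence(all_sentences)  # streams one tokenized line at a time
w2v = Word2Vec(sentences=sentences_iter, vector_size=100, window=5, min_count=2, workers=4)
w2v.save("/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation.w2v")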