"""TF-IDF based similarity""" from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import linear_kernel from urduhack.preprocessing import remove_punctuation from urduhack.tokenization.words import fix_join_words from urduhack import normalize import pandas as pd documents = [] df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv") for index, row in df.iterrows(): aya_no = row["Unnamed: 0"] if isinstance(row['translation'], str): translation = remove_punctuation(fix_join_words(normalize(row["translation"]))) documents.append(translation) q1 = "کیا سود حرام ہے" q2 = "عورت کے لئے فرائض" query = normalize("یہود نصاری") vectorizer = TfidfVectorizer() vectors = vectorizer.fit_transform([query] + documents) cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten() document_scores = [item.item() for item in cosine_similarities[1:]] print(sorted(zip(document_scores, documents), reverse=True)[:10])
"""Build DOCUMENTS: one tokenised, cleaned translation per verse, pooled
from every CSV in the translations directory."""
import re
from pathlib import Path

import pandas as pd

from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization.words import fix_join_words

DOCUMENTS = []
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs'
             ).glob('*.csv')
for path in paths:
    path_in_str = str(path)
    print(path_in_str)  # progress indicator: one line per CSV processed
    df = pd.read_csv(path_in_str)
    for _, row in df.iterrows():
        # A missing translation is read back as NaN (float); keep strings only.
        if isinstance(row['translation'], str):
            translation = row['translation'].strip()
            # Clean-up pipeline: normalise Unicode, re-join split words,
            # drop punctuation, then map digit characters.
            translation = replace_numbers(
                remove_punctuation(fix_join_words(normalize(translation))))
            # Punctuation removal can leave double spaces; collapse them
            # so split() yields clean tokens.
            translation = re.sub(" +", " ", translation)
            DOCUMENTS.append(translation.split())
"""LSI (latent semantic indexing) similarity over verse translations.

Builds a bag-of-words corpus from cleaned translations, trains a 50-topic
LSI model, and prepares a MatrixSimilarity index plus the LSI projection of
one sample query.
"""
# NOTE(review): the original imported only MatrixSimilarity — `Dictionary`
# and `LsiModel` were used but never imported, which raises NameError.
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

from urduhack import normalize
from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words

import pandas as pd

# Tokenised, cleaned translations; non-string (NaN) rows are skipped.
documents = []
df = pd.read_csv(
    "/Users/muhammadfahid/PycharmProjects/Islam-360/ahsanulbayan.db.csv")
for _, row in df.iterrows():
    if isinstance(row['translation'], str):
        translation = remove_punctuation(
            fix_join_words(normalize(row["translation"])))
        documents.append(translation.split())

# Path to a pretrained word2vec model; referenced but not loaded here.
model_path = "/Users/muhammadfahid/PycharmProjects/Islam-360/rel_model-0.9.w2v"

# token <-> id mapping, bag-of-words corpus, and the LSI topic model.
dictionary = Dictionary(documents)
corpus = [dictionary.doc2bow(document) for document in documents]
lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=50)

# Sample queries for experimentation; q2 is the one actually projected.
q1 = "کیا سود حرام ہے"
q2 = "بنی اسرائیل کے لیے دعوت"
q3 = "منافقانہ کردار کیا ہے"

vec_bow = dictionary.doc2bow(q2.split())
vec_lsi = lsi[vec_bow]  # query projected into the 50-dim LSI space
similarity_indexes = MatrixSimilarity(lsi[corpus])
"""Extract cleaned sentences from every translation/tafseer CSV into one
plain-text file (one sentence per line), for word2vec training."""
import re
from pathlib import Path

import pandas as pd

from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization import sentence_tokenizer
from urduhack.tokenization.words import fix_join_words

paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs'
             ).glob('*.csv')
all_sentences = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt"

# `with` guarantees the output file is flushed and closed even if a CSV read
# or cleaning step raises mid-loop (the original open()/close() pair leaked
# the handle on any exception).
with open(all_sentences, "w", encoding="utf8") as file_open:
    for path in paths:
        path_in_str = str(path)
        print(path_in_str)  # progress indicator: one line per CSV
        df = pd.read_csv(path_in_str)
        for _, row in df.iterrows():
            # Missing cells come back as NaN (float); keep strings only.
            if isinstance(row['translation'], str):
                translation = fix_join_words(
                    normalize(row['translation']).strip())
                for trans in sentence_tokenizer(translation):
                    trans = remove_punctuation(trans)
                    trans = re.sub(" +", " ", trans)
                    trans = replace_numbers(trans)
                    file_open.write(trans + "\n")
            if isinstance(row['tafseer'], str):
                sents = sentence_tokenizer(
                    fix_join_words(normalize(row['tafseer']).strip()))
                for sent in sents:
                    sent = remove_punctuation(sent)
                    sent = re.sub(" +", " ", sent)
                    # NOTE(review): tafseer sentences deliberately left
                    # without replace_numbers(), mirroring the original —
                    # confirm whether the asymmetry is intended.
                    file_open.write(sent + "\n")