Example #1
from pathlib import Path
import re

import pandas as pd

from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization.words import fix_join_words

# One cleaned, whitespace-tokenized document per translation row.
DOCUMENTS = []
paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs'
             ).glob('*.csv')
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = row['translation'].strip()
            translation = replace_numbers(
                remove_punctuation(fix_join_words(normalize(translation))))
            translation = re.sub(" +", " ", translation)
            DOCUMENTS.append(translation.split())

# DOCUMENTS = []
# df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv")
# for index, row in df.iterrows():
#     if isinstance(row['translation'], str):
#         translation = normalize(row['translation'])
#         translation = translation.strip()
#         translation = fix_join_words(translation)
#         trans = remove_punctuation(translation)
#         trans = re.sub(" +", " ", trans)
#         trans = trans.split()
#         DOCUMENTS.append(trans)
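
DOCUMENTS now holds one token list per row, ready for a bag-of-words or embedding model. A quick sanity check on the corpus, using only the standard library (the counts printed depend entirely on your CSVs):

from collections import Counter

token_counts = Counter(token for doc in DOCUMENTS for token in doc)
print("documents:", len(DOCUMENTS))
print("vocabulary size:", len(token_counts))
print("most common tokens:", token_counts.most_common(10))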
Example #2
"""TF-IDF based similarity"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from urduhack.preprocessing import remove_punctuation
from urduhack.tokenization.words import fix_join_words
from urduhack import normalize

import pandas as pd

documents = []
df = pd.read_csv("/Users/muhammadfahid/PycharmProjects/data_preprocess/islam-360/ahsanulbayan.db.csv")
for index, row in df.iterrows():
    aya_no = row["Unnamed: 0"]  # verse index column; read but not used below
    if isinstance(row['translation'], str):
        translation = remove_punctuation(fix_join_words(normalize(row["translation"])))
        documents.append(translation)

q1 = "کیا سود حرام ہے"  # sample query ("is interest forbidden?"); unused
q2 = "عورت کے لئے فرائض"  # sample query ("duties for women"); unused
query = normalize("یہود نصاری")  # active query: "Jews and Christians"
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([query] + documents)

cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()
document_scores = [item.item() for item in cosine_similarities[1:]]

print(sorted(zip(document_scores, documents), reverse=True)[:10])
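
Because TfidfVectorizer L2-normalizes its rows by default, linear_kernel on these vectors is exactly cosine similarity. If the row positions of the top matches are also needed (for example, to map back to verse numbers), a small variant of the ranking above using numpy, which sklearn already depends on:

import numpy as np

scores = cosine_similarities[1:]         # drop the query's similarity to itself
top_idx = np.argsort(scores)[::-1][:10]  # positions of the 10 best matches
for i in top_idx:
    print(i, scores[i], documents[i])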
Example #3
from pathlib import Path
import re

import pandas as pd

from urduhack import normalize
from urduhack.preprocessing import remove_punctuation, replace_numbers
from urduhack.tokenization import sentence_tokenizer
from urduhack.tokenization.words import fix_join_words

paths = Path('/home/ikram/workplace/datasets/translation_and_tafaseer/csvs'
             ).glob('*.csv')
# One cleaned sentence per line; the path suggests downstream Word2Vec training.
all_sentences = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt"
file_open = open(all_sentences, "w", encoding="utf8")
for path in paths:
    path_in_str = str(path)
    print(path_in_str)
    df = pd.read_csv(path_in_str)
    for index, row in df.iterrows():
        if isinstance(row['translation'], str):
            translation = normalize(row['translation'])
            translation = translation.strip()
            translation = fix_join_words(translation)
            for trans in sentence_tokenizer(translation):
                trans = remove_punctuation(trans)
                trans = re.sub(" +", " ", trans)
                trans = replace_numbers(trans)
                file_open.write(trans + "\n")
        if isinstance(row['tafseer'], str):
            sents = sentence_tokenizer(
                fix_join_words(normalize(row['tafseer']).strip()))
            for sent in sents:
                sent = remove_punctuation(sent)
                sent = re.sub(" +", " ", sent)
                file_open.write(sent + "\n")
file_open.close()
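
The output path suggests this sentence file feeds Word2Vec training. A minimal sketch of that next step, assuming gensim 4.x; the hyperparameters and model path below are illustrative placeholders, not taken from the original project:

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Each line of the file is one whitespace-tokenized sentence.
sentences = LineSentence(all_sentences)
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
model.save("translation_w2v.model")  # placeholder path
print(model.wv.most_similar("قرآن", topn=5))  # assumes the token occurs in the corpus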