Пример #1
0
 def __init__(self, inFile, outFile):
     """Keep the input/output arguments and create the text-processing tools.

     :param inFile: stored unchanged as ``self.inFile``.
     :param outFile: stored unchanged as ``self.outFile``.
     """
     self.inFile, self.outFile = inFile, outFile

     # Components are built once here and reused through the instance.
     self.normalizer = Normalizer()
     self.tagger = POSTagger(model='resources/postagger.model')
     self.lemmatizer = Lemmatizer()
     self.stemmer = Stemmer()
def doc_normalizer(doc):
    """Normalize every item of ``doc`` and return the results as a list.

    :param doc: iterable of texts; each item is passed to
        ``Normalizer.normalize``.
    :return: list of the normalized items, in the order they appear in ``doc``.
    """
    normalizer = Normalizer()
    # Iterate the items directly instead of indexing through range(len(doc)),
    # so any iterable (not only indexable sequences) is accepted.
    return [normalizer.normalize(item) for item in doc]
Пример #3
0
    def score(self, sentences):
        """Return an averaged polarity score for ``sentences``.

        The text is split into sentences; each sentence is normalized and
        tokenized, and every word is classified by the model returned from
        ``self.__get_model()``. Words labelled ``'pos'`` and ``'neg'`` are
        counted.

        :param sentences: raw text to score.
        :return: ``(pos - neg) / (2 * total_words)`` as a float, where the
            counts cover all words of all sentences; ``0.0`` when the text
            contains no words.
        """
        classifier = self.__get_model()
        normalizer = Normalizer()

        pos, neg, total_words = 0, 0, 0
        for sentence in sent_tokenize(sentences):
            words = word_tokenize(normalizer.normalize(sentence))
            # Accumulate the word count over every sentence: the previous
            # version divided by the length of the *last* sentence only.
            total_words += len(words)

            for word in words:
                # NOTE(review): the previous version called stemmer.stem(word)
                # here and discarded the result, so features were always built
                # from the unstemmed word. That behaviour is kept; confirm
                # whether the model expects stemmed input.
                class_result = classifier.classify(self.__word_feats(word))
                if class_result == 'pos':
                    pos += 1
                elif class_result == 'neg':
                    neg += 1

        # Empty input used to fail with NameError / ZeroDivisionError.
        if total_words == 0:
            return 0.0

        positive_sentiment = float(pos) / total_words
        negative_sentiment = -float(neg) / total_words

        return (positive_sentiment + negative_sentiment) / 2
def prepare():
    """Normalize a fixed Persian sample text and print punctuation positions.

    For every key of ``labels`` the index returned by ``str.find`` on the
    normalized sample is printed, then ``exit(0)`` is called.

    NOTE(review): everything after ``exit(0)`` is unreachable and looks like
    unfinished scratch code — confirm whether it should be completed or removed.
    """
    normalizer = Normalizer()
    # Not used anywhere below.
    stemmer = Stemmer()

    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''

    # Constructed but not used anywhere below.
    tokenizer = WordTokenizer(join_verb_parts=True,
                              separate_emoji=True,
                              replace_links=True,
                              replace_IDs=True,
                              replace_emails=True,
                              replace_numbers=True,
                              replace_hashtags=True)

    # Punctuation mark -> label name.
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}
    normal_string = normalizer.normalize(string)
    for label in labels.keys():
        # Index of the first occurrence of the mark, or -1 when it is absent.
        print(normal_string.find(label))

    # exit(0) raises SystemExit here, so the loop below is never executed.
    exit(0)
    for i, sent in enumerate([1, 2, 3, 4]):
        entities = []
        # Bare tuple expression with no effect.
        # NOTE(review): looks like a sample (start, end, label) entity span — confirm.
        (10, 15, 'PrdName')
        for label in labels.keys():
            # `sent` is an int here, so `label in sent` would raise TypeError if reached.
            print(f'{label} in {i}', label in sent)
        # Built but never stored or returned.
        record = (sent, {'entities': entities})

        print()
Пример #5
0
def prepare_line(line):
    """Normalize ``line`` and put spaces around symbols, punctuation and digits.

    On first use the shared ``normalizer`` and the ``incorrect``/``correct``
    coding tables are created and cached in module globals.

    :param line: raw text line.
    :return: the cleaned line, with whitespace runs collapsed to single
        spaces and no leading/trailing whitespace.
    """
    # Only the names that are assigned need the ``global`` declaration; the
    # other module-level values are merely read.
    global normalizer, incorrect, correct
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = CorrectCodings.loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodings.CorrectCodingInLine(line, incorrect, correct)

    # Blank out links.
    # NOTE(review): "(.)*" is greedy, so everything from "http" to the end of
    # the line is removed, not only the URL itself — confirm this is intended.
    line = re.sub(r"https?(.)*[^\s]+", r" ", line)

    # Turn literal backslash-n sequences into real newlines.
    line = re.sub(r"\\n", "\n", line)
    # Pad every character that is neither whitespace, a letter/digit in the
    # listed Persian ranges, nor an ASCII letter or digit.
    line = re.sub(r"([^\sا-ی۰-۹a-zA-Z\d])", r" \1 ", line)

    # The previous version ran this substitution once per entry of
    # ``punctuations`` although the pattern never used the loop variable; the
    # extra passes only added spaces that are collapsed below. One pass gives
    # the same result. The guard keeps the "no punctuations, no pass" case.
    if punctuations:
        line = re.sub(r"([" + punct_str + "])", r" \1 ", line)

    line = re.sub(r"([" + digits + "]+)", r" \1 ", line)
    line = re.sub(r" +", r" ", line)
    line = re.sub(r"\n+", r" \n ", line)
    line = re.sub("[" + whitespace_chars + "]+", r" ", line)
    return line.strip()
Пример #6
0
    def handle(self, *args, **options):
        """Compute and store an embedding for every article not yet vectorized.

        Each article text is normalized, stripped of punctuation and split
        into words; words of length <= 2 are dropped. The word vectors from
        the fastText model at ``options['path']`` are averaged, scaled to unit
        length and saved as an ``ArticleVector``; the article is then flagged
        ``is_vectorized``. A failure on one article is printed and does not
        stop the run.

        :param options: must contain ``'path'``, the model file to load.
        """
        articles = Article.objects.filter(is_vectorized=False)

        N = Normalizer()
        FT = fasttext.load_model(options['path'])
        # Loop-invariant: build the punctuation-stripping table once.
        strip_punctuation = str.maketrans('', '', punctuation)

        # Counts successfully stored articles (it is not advanced on failure).
        index = 0
        for article in articles:
            try:
                if index % 100 == 0:
                    print(index)
                text = N.normalize(article.text).translate(strip_punctuation)
                words = [word for word in text.split() if len(word) > 2]
                if not words:
                    # mean() over an empty list yields NaN and the division
                    # below would then store a NaN embedding; report the
                    # article through the existing error path instead.
                    raise ValueError(
                        'article has no words longer than 2 characters')
                vector = nan_to_num(
                    mean([FT.get_word_vector(w) for w in words], axis=0))
                # Scale to unit Euclidean length.
                vector = vector / (vector.dot(vector))**0.5
                obj = ArticleVector(article=article, embedding=vector.tolist())
                obj.save()
                article.is_vectorized = True
                article.save()
                index += 1
            except Exception as e:
                # Best effort: report the problem and continue with the next article.
                print(e)
Пример #7
0
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Tokenize `text` and count its words.

        When `persian_normalize` is set, `text` is first normalized with the Hazm Normalizer.
        When `include_numbers` is not set, ASCII digits and the digits in U+06F0-U+06F9 and
        U+0660-U+0669 are removed from `text`.
        Words are extracted with `self.regexp` when it is set, otherwise with `word_tokenize`.

        :param text: The text to process
        :return: a dictionary whose keys are words and whose values are their frequencies.
        """
        # re.UNICODE is only requested for Python 2 unicode objects.
        if version < '3' and type(text) is unicode:  # noqa: F821
            flags = re.UNICODE
        else:
            flags = 0

        if self.persian_normalize:
            text = Normalizer().normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        words = (re.findall(self.regexp, text, flags)
                 if self.regexp else word_tokenize(text))

        if not self.collocations:
            word_counts, _ = process_tokens(words, self.normalize_plurals)
            return word_counts

        return unigrams_and_bigrams(words, self.normalize_plurals)
Пример #8
0
 def normalize(self):
     """
     Normalize every non-empty line of ``self.data``.

     The normalized lines are appended to ``self.normalize_text``.

     :return: ``self.normalize_text`` after the new lines were appended.
     """
     normalizer = Normalizer()
     self.normalize_text.extend(
         normalizer.normalize(row)
         for row in self.data.split('\n')
         if row != "")
     return self.normalize_text
def statement_pre_processing(input_statement):
    """Normalize and tokenize a statement, then lemmatize its non-stop words.

    :param input_statement: raw statement text.
    :return: list with the lemma of every token that is not in ``stops``.
    """
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    lemmas = []
    for token in word_tokenize(normalizer.normalize(input_statement)):
        if token in stops:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
Пример #10
0
    def __init__(self):
        """Create the normalizer and load the normalized stop-word set.

        Reads ``DATA_DIR / 'stopwords.txt'`` line by line; every line is
        stripped and normalized before it is added to ``self.stop_words``.
        """
        # persian words normalizer
        self.normalizer = Normalizer()

        # load stopwords
        stopwords_path = DATA_DIR / 'stopwords.txt'
        logger.info(f"Loading stopwords from {stopwords_path}")
        # The context manager closes the file; the previous version left the
        # handle open.
        # NOTE(review): the file is read with the platform default encoding —
        # confirm whether an explicit UTF-8 encoding is required.
        with open(stopwords_path) as stopwords_file:
            self.stop_words = {
                self.normalizer.normalize(entry.strip())
                for entry in stopwords_file
            }
Пример #11
0
    def __init__(self, path_dataset, path_stopwords):
        """Store both paths, load the stop words and create a normalizer.

        :param path_dataset: stored as ``self.path_dataset``.
        :param path_stopwords: stored as ``self.path_stopwords`` and passed to
            ``self.file_reader`` to obtain ``self.stopwords``.
        """

        self.path_dataset = path_dataset

        self.path_stopwords = path_stopwords

        # file_reader runs before self.normalizer exists.
        # NOTE(review): presumably it does not rely on the normalizer — confirm.
        self.stopwords = self.file_reader(self.path_stopwords)

        self.normalizer = Normalizer()
Пример #12
0
    def bigram_cleaner(text):
        """Clean ``text`` and return its tokens.

        Matches of ``Text_cleaner.persian_regex`` are replaced by a space, runs
        of spaces are collapsed, and the result is normalized and tokenized.
        """
        spaced = re.sub(Text_cleaner.persian_regex, ' ', text)
        collapsed = re.sub('[ ]+', ' ', spaced)
        return word_tokenize(Normalizer().normalize(collapsed))
Пример #13
0
def tokenize(paragraph, wanted_list):
    """Append the sentences of the normalized ``paragraph`` to ``wanted_list``.

    :param paragraph: text to normalize and split into sentences.
    :param wanted_list: list that receives the sentences (modified in place).
    """
    normalizer_options = dict(
        remove_extra_spaces=True,
        punctuation_spacing=True,
        persian_style=False,
        persian_numbers=False,
        remove_diacritics=False,
        affix_spacing=False,
        token_based=False,
    )
    normal = Normalizer(**normalizer_options)
    wanted_list.extend(sent_tokenize(normal.normalize(paragraph)))
Пример #14
0
def test_word_visualization(model_path, some_words):
    """Load a word2vec model and plot ``some_words`` with ``W2VPersianVis``.

    :param model_path: path of the saved ``Word2Vec`` model.
    :param some_words: words handed to the visualizer.
    """
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)
    # Normalize each word once (it used to be normalized twice per word) and
    # test membership on the vocabulary mapping itself instead of ``.keys()``.
    normalized_words = [normalizer.normalize(word) for word in some_words]
    # Not used below; kept because the lookup was part of the original check.
    vectors = [model[word] for word in normalized_words
               if word in model.vocab]
    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
Пример #15
0
def test_word_visualization(model_path, some_words):
    """Load a word2vec model and plot ``some_words`` with ``W2VPersianVis``.

    :param model_path: path of the saved ``Word2Vec`` model.
    :param some_words: words handed to the visualizer.
    """
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)
    # Normalize each word once (it used to be normalized twice per word) and
    # test membership on the vocabulary mapping itself instead of ``.keys()``.
    normalized_words = [normalizer.normalize(word) for word in some_words]
    # Not used below; kept because the lookup was part of the original check.
    vectors = [
        model[word] for word in normalized_words
        if word in model.vocab
    ]
    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
Пример #16
0
 def preprocess(self, cm):
     """Clean a comment and return it as a space-joined string of tokens.

     Punctuation characters and ASCII digits are removed, a handful of
     characters are replaced (see below), the text is normalized and
     tokenized, and tokens found in ``self.stopwords`` are dropped.
     """
     cm = ''.join(ch for ch in str(cm) if ch not in punctuation)
     cm = re.sub(r"[0-9]", '', self._numbers_to_english(cm))

     # ZWNJ becomes a space, line breaks are dropped, and the Arabic forms
     # of yeh/kaf are replaced by the Persian ones.
     substitutions = (
         ('\u200c', ' '),
         ('\n', ''),
         ('\r', ''),
         ('ي', 'ی'),
         ('ك', 'ک'),
     )
     for old, new in substitutions:
         cm = cm.replace(old, new)

     cm = Normalizer().normalize(cm)
     kept = [token for token in word_tokenize(cm)
             if token not in self.stopwords]
     return ' '.join(kept)
Пример #17
0
def pipeline_sentence(sentence, model, tokenizer):
    """Run the NER pipeline on a sentence and on its lemmatized form.

    :param sentence: raw sentence; passed through ``change_words`` and the
        normalizer first.
    :param model: model handed to ``pipeline("ner", ...)``.
    :param tokenizer: tokenizer handed to ``pipeline("ner", ...)``.
    :return: tuple ``(sentence_ner, sentence_ner_lem, sentence_lem, sentence)``
        where ``sentence`` is the normalized sentence and ``sentence_lem`` its
        space-joined lemmas.
    """
    sentence = change_words(sentence)

    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)

    # Build the lemmatizer once; the previous version constructed a new
    # Lemmatizer for every token.
    lemmatizer = Lemmatizer()
    sentence_lem = ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(normalizer.normalize(sentence))
    )

    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
Пример #18
0
def preprocess(doc):
    """Normalize, split, stem and lemmatize ``doc``.

    The normalized text is split on spaces and hyphens, tokens found in
    ``stopwords`` are dropped, the rest is stemmed, stems that are not
    alphanumeric are discarded and the remaining stems are lemmatized.

    :param doc: raw document text.
    :return: list of lemmatized stems.
    """
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()

    tokens = re.split(' |-', normalizer.normalize(doc))
    # One filtering pass instead of calling list.remove() per stop word,
    # which rescanned the list for every removal.
    content_tokens = [token for token in tokens if token not in stopwords]
    stems = (stemmer.stem(token) for token in content_tokens)
    return [lemmatizer.lemmatize(stem) for stem in stems if stem.isalnum()]
Пример #19
0
def entofa(bot, update):
    """Reply with the message text after mapping ASCII keys to Persian characters.

    Every character listed in ``eng`` is replaced by the entry at the same
    position in ``per``; the result is normalized and sent back to the chat
    the message came from.
    """
    per = ["ﺽ", "ﺹ", "ﺙ",  "ﻕ",  "ﻑ",  "ﻍ",  "ﻉ",  "ﻩ",  "ﺥ",  "ﺡ",  "ﺝ",  "چ",  "ﺵ",  "ﺱ",  "ی",
        "ﺏ",  "ﻝ",  "ﺍ",  "ﺕ",  "ﻥ",  "ﻡ",  "ک",  "گ",  "ﻅ",  "ﻁ",  "ﺯ",  "ﺭ",  "ﺫ",  "ﺩ",  "پ",  "ﻭ"]

    eng = ["q",  "w",  "e",  "r",  "t",  "y",  "u",  "i",  "o",  "p",  "[",  "]",  "a",  "s",  "d",
        "f",  "g",  "h",  "j",  "k",  "l",  ";",  "'",  "z",  "x",  "c",  "v",  "b",  "n",  "m",  ","]

    s = update.message.text
    # Both lists have the same length, so pairing them covers every entry.
    for ascii_key, persian_char in zip(eng, per):
        s = s.replace(ascii_key, persian_char)

    s = Normalizer().normalize(s)
    bot.sendMessage(update.message.chat_id, text=s)
def dataset_cleaner(dataset):
    """Turn every statement of ``dataset`` into a list of lemmas.

    Each statement is normalized and tokenized; tokens found in ``stops`` are
    skipped and the remaining ones are lemmatized.

    :param dataset: iterable of raw statements.
    :return: list with one lemma list per statement, in input order.
    """
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    statements = []
    # Iterate the statements directly instead of indexing via range(len(...)).
    for raw_statement in dataset:
        tokens = word_tokenize(normalizer.normalize(raw_statement))
        statements.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stops
        ])
    return statements
Пример #21
0
def document(filepath):
    """Read a text file and return its filtered tokens.

    The file is decoded as UTF-8 (undecodable bytes are ignored), passed
    through ``remove_punctuation`` and the normalizer, and tokenized.

    :param filepath: path of the file to read.
    :return: list of tokens that are neither in ``stop_words`` nor made up
        of digits only.
    """
    # `with` closes the file even when read() raises.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        txt = f.read()

    txt = remove_punctuation(txt)
    txt = Normalizer().normalize(txt)

    # The local name differs from the function name so it is not shadowed.
    tokens = word_tokenize(txt)
    return [
        word for word in tokens
        if word not in stop_words and not word.isdigit()
    ]
Пример #22
0
class PersianTextPreProcessor:
    """Cleaning, tokenizing and stemming helpers for Persian text."""

    # Matches every character that is not one of the listed letters, a space
    # or a newline. Compiled once for all instances.
    _NON_PERSIAN_RE = re.compile(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]')
    # Raw string: the previous '\d+' literal relied on an invalid escape.
    _DIGITS_RE = re.compile(r'\d+')

    def __init__(self):
        self.stemmer = Stemmer()
        self.normalizer = Normalizer()
        self.punctuations = string.punctuation

    def _strip_to_persian(self, text):
        """Lower-case ``text`` and reduce it to the allowed letters.

        Digits are removed, punctuation and every other character outside the
        allowed set become spaces, and whitespace runs are collapsed to a
        single space. Shared by ``process_single_word`` and
        ``pre_stopword_process``, which used to duplicate these steps.
        """
        text = text.lower()
        text = self._DIGITS_RE.sub('', text)
        text = text.translate(
            str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        text = ' '.join(self._NON_PERSIAN_RE.sub(' ', text).split())
        return text.strip()

    def process_single_word(self, word):
        """Clean, normalize and stem a single word.

        :param word: raw word.
        :return: the stem of the cleaned, normalized word.
        """
        word = self._strip_to_persian(word)
        word = self.normalizer.normalize(word)
        return self.stemmer.stem(word)

    def pre_stopword_process(self, text):
        """Clean, normalize and tokenize ``text``.

        :param text: raw text.
        :return: list of tokens without the ``'.'`` token.
        """
        normalized_text = self.normalizer.normalize(
            self._strip_to_persian(text))
        return [w for w in word_tokenize(normalized_text) if w != '.']

    def clean_text(self, text, stopwords, remove_stopwords=True, stem=True):
        """Tokenize ``text`` and optionally drop stop words and stem.

        :param text: raw text.
        :param stopwords: container of words to drop.
        :param remove_stopwords: drop tokens found in ``stopwords`` when true.
        :param stem: stem the remaining tokens when true.
        :return: list of processed tokens.
        """
        words = self.pre_stopword_process(text)
        if remove_stopwords:
            words = [w for w in words if w not in stopwords]

        if stem:
            words = self.stem(words)
        return words

    def stem(self, words):
        """Return the stem of every item of ``words`` as a list."""
        return [self.stemmer.stem(w) for w in words]
Пример #23
0
class HazmNormalizer(Component):
    """Component that runs the Hazm ``Normalizer`` over a message's text values."""

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        self._normalizer = Normalizer()

    def process(self, message: Message, **kwargs: Any) -> None:
        """Normalize ``message.text`` and the string/list values of ``message``.

        Keys listed in the optional ``exclude_items`` keyword argument are
        left untouched. String values are replaced by their normalized form;
        list values are normalized item by item, in place.
        """
        normalize = self._normalizer.normalize
        message.text = normalize(message.text)

        skipped_keys = set(kwargs.get('exclude_items', ()))
        for key, value in message:
            if key in skipped_keys:
                continue
            if isinstance(value, str):
                message[key] = normalize(value)
                continue
            if isinstance(value, list):
                for position, element in enumerate(value):
                    value[position] = normalize(element)
0
class PoemSentences(object):
    """Iterable over the tokenized lines of every file in a directory."""

    def __init__(self, poems_path):
        """Remember the directory and create the normalizer.

        :param poems_path: directory whose files are read on iteration.
        """
        self.poems_path = poems_path
        self.normalizer = Normalizer()

    def __iter__(self):
        """Yield the token list of each line of each file in ``poems_path``.

        Before normalization, every ``'هٔ'`` in a line is replaced by ``'ه'``.
        """
        for poem_file in os.listdir(self.poems_path):
            # `with` closes each file once it is exhausted (or when the
            # generator is closed); the previous version never closed them.
            with open(os.path.join(self.poems_path, poem_file)) as poem:
                for sentence in poem:
                    yield word_tokenize(
                        self.normalizer.normalize(sentence.replace('هٔ', 'ه')))
Пример #25
0
 def normalize_words(words: Iterable) -> List[str]:
     """
     Normalize Farsi words with Hazm in a single normalizer call.

     Every element of `words` is terminated with a newline, the concatenated text is normalized once and
     the normalized text is split on newlines again.

     :param words: an iterable of words
     :return: the pieces obtained by splitting the normalized text on newlines.
     """
     newline_terminated: str = "".join(word + "\n" for word in words)
     normalized: str = Normalizer().normalize(newline_terminated)
     return normalized.split("\n")
Пример #26
0
def process_text(text):
    """Normalize ``text``, strip separator characters and lemmatize its tokens.

    :param text: raw text.
    :return: list with the lemma of every extracted token, in order.
    """
    text = Normalizer().normalize(text)

    # Separators that are turned into a space.
    for separator in ("_", ",", "(", ")", ".", "،", "«", "»"):
        text = text.replace(separator, " ")
    # Characters that are removed outright.
    # NOTE(review): "\u220c" is a mathematical symbol and looks like a typo of
    # "\u200c" (which is also removed); it is kept to preserve behaviour.
    for removable in ("\u220c", "\u200c", "-"):
        text = text.replace(removable, "")

    # Keep runs of characters in U+0627..U+06FF, the markers <S> and </s>,
    # "?" and "//"; everything else is dropped. Raw string: the previous
    # literal contained the invalid escape "\?".
    tokens = re.findall(r"[\u0627-\u06FF]+|<S>|</s>|\?|//", text)

    lemmatizer = Lemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
Пример #27
0
def clean(sentence):
    """Drop leading digits and non-Farsi characters, then normalize.

    :param sentence: raw sentence.
    :return: normalized string built from the characters of ``sentence``
        (after its leading digits) that are in ``FARSI_ALPHABET`` or
        ``FARSI_DIGITS``.
    """
    # Length of the leading run of Farsi/English digits.
    leading_digits = 0
    for ch in sentence:
        if ch not in FARSI_DIGITS and ch not in ENGLISH_DIGITS:
            break
        leading_digits += 1

    # Keep Farsi letters and Farsi digits only.
    kept = [
        ch for ch in sentence[leading_digits:]
        if ch in FARSI_ALPHABET or ch in FARSI_DIGITS
    ]
    return Normalizer().normalize("".join(kept))
Пример #28
0
def prepare_line(line):
    """Normalize ``line`` and put spaces around punctuation and digit runs.

    On first use the shared ``normalizer`` and the ``incorrect``/``correct``
    coding tables are created and cached in module globals.

    :param line: raw text line.
    :return: the cleaned line without leading/trailing whitespace.
    """
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = loadCodings("TableCodings.txt")

    line = CorrectCodingInLine(normalizer.normalize(line), incorrect, correct)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"([" + re.escape(punct_str) + "])", r" \1 "),  # pad punctuation
        (r"([" + digits + "]+)", r" \1 "),               # pad digit runs
        (r"\n+", r" \n "),                               # one padded newline
        ("[" + whitespace_chars + "]+", r" "),           # collapse whitespace
    )
    for pattern, replacement in substitutions:
        line = re.sub(pattern, replacement, line)

    return line.strip()
Пример #29
0
def clean_tweet(tweet):
    """Clean a tweet and return its remaining tokens joined by spaces.

    The text is lower-cased, stripped of '#' signs, passed through the
    ``remove_*`` helpers, normalized and tokenized; digit-only tokens and
    Persian/English stop words are dropped.
    """
    tweet = str(tweet).lower()
    # Only the '#' sign is dropped, so the hashtag word itself stays in the text.
    tweet = tweet.replace("#", "")

    cleaners = (
        remove_links,
        remove_mentions,
        remove_emoji,
        remove_punctuations,
        remove_reserved_words,
    )
    for cleaner in cleaners:
        tweet = cleaner(tweet)

    tweet = Normalizer().normalize(tweet)
    # Arabic yeh -> Persian yeh.
    tweet = tweet.replace('ي', 'ی')
    # Drop verbs such as می‌شود or نمی‌گویند.
    tweet = re.sub(r'ن?می[‌]\S+', '', tweet)

    kept = [
        token for token in word_tokenize(tweet)
        if not token.isdigit()
        and token not in stopwords.persian
        and token not in stopwords.english
    ]
    return " ".join(kept).strip()
Пример #30
0
    def prepare_text(text, should_stem=True):
        """Normalize and tokenize ``text``, strip punctuation, optionally stem.

        :param text: raw text.
        :param should_stem: when true, return the non-empty stems of the
            punctuation-free tokens; otherwise return those tokens unchanged.
        :return: list of processed tokens.
        """
        tokens = word_tokenize(Normalizer().normalize(text))

        # Punctuation removal; "$" marks a token that became empty.
        def strip_punctuation(w):
            stripped = re.sub(Text_cleaner.punct_regex, '', w).replace('،', '')
            return stripped if stripped != "" else "$"

        punc_free = [
            cleaned for cleaned in map(strip_punctuation, tokens)
            if cleaned != '$'
        ]

        stemmer = Stemmer()
        if not should_stem:
            return punc_free

        return [stem for stem in map(stemmer.stem, punc_free) if stem != '']
Пример #31
0
def clean_persianText(txt):
    """Run the normalizer's individual steps over ``txt`` and remove dots.

    ``character_refinement``, ``affix_spacing`` and ``punctuation_spacing``
    are applied in that order, every '.' is removed, and the result is passed
    through ``normalize``.
    """
    normalizer = Normalizer()
    steps = (
        normalizer.character_refinement,
        normalizer.affix_spacing,
        normalizer.punctuation_spacing,
    )
    for step in steps:
        txt = step(txt)
    return normalizer.normalize(txt.replace('.', ''))
Пример #32
0
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Tokenize `text` and count its words.

        When `remove_unhandled_utf_characters` is set, matches of
        `WordCloudFa.unhandled_characters_regex` are removed first.
        When `persian_normalize` is set, `text` is normalized with the Hazm Normalizer.
        When `include_numbers` is not set, ASCII digits and the digits in U+06F0-U+06F9 and
        U+0660-U+0669 are removed from `text`.
        Attention: this method does not remove stopwords from the input.

        :param text: The text to process
        :return: a dictionary whose keys are words and whose values are their frequencies.
        """
        # re.UNICODE is only requested for Python 2 unicode objects.
        if version < '3' and type(text) is unicode:  # noqa: F821
            flags = re.UNICODE
        else:
            flags = 0

        if self.remove_unhandled_utf_characters:
            text = WordCloudFa.unhandled_characters_regex.sub(r'', text)
        if self.persian_normalize:
            text = Normalizer().normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        words = (re.findall(self.regexp, text, flags)
                 if self.regexp else word_tokenize(text))

        if not self.collocations:
            word_counts, _ = process_tokens(words, self.normalize_plurals)
            return word_counts

        # Stopwords are removed in the WordCloudFa, so an empty list is passed here.
        return unigrams_and_bigrams(words, [],
                                    self.normalize_plurals,
                                    self.collocation_threshold)
Пример #33
0
class Summarizer(object):
    """Extractive summarizer that picks sentences containing frequent words."""

    def __init__(self):
        self.normalizer = Normalizer()

    def summarize(self, input):
        """Return the summary sentences of ``input`` in their original order.

        :param input: raw text to summarize.
        :return: list of selected sentences.
        """
        self.input = self.normalizer.normalize(input)
        self.base_words = word_tokenize(self.input)
        self.working_sentences = sent_tokenize(self.input)
        self.sentences_number = len(self.working_sentences)
        return self._get_summarize(num_sentences=self._find_num_sentences())

    def _find_num_sentences(self):
        """Return how many sentences the summary should contain.

        Texts with fewer than 6 sentences are kept whole; longer texts get
        ``int(ln(n)**2 + 1) + 1`` sentences.
        """
        if self.sentences_number < 6:
            return self.sentences_number
        return int(math.log(self.sentences_number)**2 + 1) + 1

    def _get_summarize(self, num_sentences):
        """Select up to ``num_sentences`` sentences via the most frequent words.

        For each of the 100 most frequent non-stop words, the first sentence
        that contains the word and is not selected yet is added.
        """
        # Load the stop words once; the previous version reloaded the list
        # for every word.
        persian_stopwords = set(stopwords.words('persian'))
        words = [
            word for word in self.base_words
            if word not in persian_stopwords
        ]
        word_frequencies = FreqDist(words)

        # most_common() gives the words ordered by frequency; slicing
        # ``items()`` fails on Python 3 and was not frequency-ordered.
        most_frequent_words = [
            word for word, _count in word_frequencies.most_common(100)
        ]

        actual_sentences = sent_tokenize(self.input)
        output_sentences = []

        for word in most_frequent_words:
            for working, actual in zip(self.working_sentences,
                                       actual_sentences):
                if word in working and actual not in output_sentences:
                    output_sentences.append(actual)
                    break
                if len(output_sentences) >= num_sentences:
                    break

            if len(output_sentences) >= num_sentences:
                break

        return self._reorder_sentences(output_sentences)

    def _reorder_sentences(self, output_sentences):
        """Sort ``output_sentences`` in place by position in ``self.input``."""
        # A key function replaces the Python 2 comparison function, which
        # list.sort() no longer accepts on Python 3.
        output_sentences.sort(key=self.input.find)
        return output_sentences
Пример #34
0
    def __init__(self,
                 corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01, expand_corpus=False):
        """Load the JSON resources and create the tagger and normalizer.

        :param corpus_path: JSON file with ``'corpus'`` and ``'docs_num'``
            entries; ``None`` leaves the corpus empty.
        :param symbols_json_path: JSON file whose values are lists of symbols.
        :param persian_lang_path: JSON file with the ``'epic_keywords'``,
            ``'punctuations'``, ``'persian_alphabet'`` and ``'stop_words'``
            entries.
        :param postagger_model_path: model file passed to ``POSTagger``.
        :param max_keyword_num: stored as ``self.max_keyword_num``.
        :param min_keyword_occurrences: stored as
            ``self.min_keyword_occurrences``.
        :param expand_corpus: stored as ``self.expand_corpus``.
        """
        def read_json(path):
            # Parse the UTF-8 encoded JSON file at `path`.
            with open(path, encoding='utf-8') as json_file:
                return json.load(json_file)

        # Plain settings.
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.expand_corpus = expand_corpus
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences

        # Corpus data; it stays empty when no corpus file is configured.
        self.corpus = {}
        self.docs_num = 0
        if self.corpus_path is not None:
            corpus = read_json(corpus_path)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

        # Flatten the per-key symbol lists into one list.
        symbol_groups = read_json(symbols_json_path)
        self.all_symbols_list = [
            symbol for group in symbol_groups.values() for symbol in group
        ]

        persian_lang = read_json(persian_lang_path)
        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
Пример #35
0
from __future__ import unicode_literals
from hazm import Normalizer
from hazm import sent_tokenize, word_tokenize
from hazm import Stemmer, Lemmatizer
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy as np
# Shared hazm helpers, created once at import time.
stemmer = Stemmer()
normalizer = Normalizer()
# Module-level configuration and shared state.
num_features=100  # presumably the embedding vector size — TODO confirm
num_Of_epoch = 0  # presumably the number of training epochs — TODO confirm
# Same values as the defaults of train_test_seperator's parameters of the same name.
train_rate = 0.6
validate_rate = 0.1
sentences = []  # Initialize an empty list of sentences
mylabel = []    # labels for train sentences
#_________________________________________________________________________________
def train_test_seperator(data_path, label_path, train_rate = 0.6, validate_rate = 0.1):
    """Open the data and label files and start filtering the data lines.

    NOTE(review): this definition appears to be cut off in this listing — in
    the visible lines the files are never closed, ``train_rate`` and
    ``validate_rate`` are not used and nothing is returned.

    :param data_path: path of the file holding the data lines.
    :param label_path: path of the file holding the labels.
    :param train_rate: not used in the visible part.
    :param validate_rate: not used in the visible part.
    """
    data_file = open(data_path, "r")
    label_file = open(label_path, "r")

    tmp_data = data_file.readlines()
    data_content = []
    for s in tmp_data:
        # Keep only the part of the line before the first newline ...
        s = s.split("\n")
        s = s[0]
        # ... and before the first carriage return.
        s = s.split("\r")
        s = s[0]
        # Lines consisting of this run of '@' characters are skipped.
        if(s == "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"):
            continue
Пример #36
0
from __future__ import unicode_literals
import os,sys,codecs
from hazm import Normalizer,sent_tokenize, word_tokenize


# Usage: <script> INPUT_PATH OUTPUT_PATH
# Every non-blank input line is normalized, tokenized and written out with
# its tokens separated by single spaces; blank lines are kept as blank lines.
normalizer = Normalizer()

# The context managers close both files even when processing fails; the
# previous version never closed the reader.
with codecs.open(os.path.abspath(sys.argv[1]), 'r', encoding='utf-8') as reader, \
        codecs.open(os.path.abspath(sys.argv[2]), 'w', encoding='utf-8') as writer:
    for count, line in enumerate(reader, 1):
        # Progress marker every 1000 lines.
        if count % 1000 == 0:
            sys.stdout.write(str(count) + '...')

        stripped = line.strip()
        if stripped:
            tokens = word_tokenize(normalizer.normalize(stripped))
            # Underscores inside tokens become spaces; the two replace()
            # passes shrink the double spaces this may create.
            sentence = u' '.join(tokens).replace('_', ' ').replace('  ', ' ').replace('  ', ' ')
            writer.write(sentence + u'\n')
        else:
            writer.write(u'\n')

sys.stdout.write('\n')
Пример #37
0
from __future__ import unicode_literals
from hazm import Normalizer
from hazm import sent_tokenize, word_tokenize
from hazm import Stemmer, Lemmatizer
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
# Shared hazm helpers, created once at import time.
stemmer = Stemmer()
normalizer = Normalizer()


# Module-level configuration and shared state.
num_features = 100
num_Of_epoch = 10
sentences = []  # Initialize an empty list of sentences

sentence_path = '/home/bero/Desktop/dataset/Persian Product Review Dataset/totaldata'
label_path = '/home/bero/Desktop/dataset/Persian Product Review Dataset/totallabel'

# One integer label per line. `with` closes the file even if reading fails.
with open(label_path, 'r') as file_to_read:
    labels = file_to_read.readlines()
# int() ignores the trailing newline, so the lines need no manual splitting.
mylabel = [int(line) for line in labels]  # labels for train sentences

# Raw sentence lines, one per entry, with their line endings.
with open(sentence_path, 'r') as file_to_read:
    file_content = file_to_read.readlines()
Пример #38
0
    def on_chat_message(self, msg):
        """Handle one incoming chat message and send at most one reply.

        The handler reads ``msg['text']``, ``msg['from']`` and ``msg['chat']``
        and builds a reply string ``r``; a non-empty ``r`` is sent through
        ``self.sender.sendMessage`` with ``parse_mode='HTML'``.

        Flow, in order:
          1. ``/start`` and ``mojose`` shortcuts.
          2. In private chats the bot keyword is prepended when missing, so
             later branches can assume ``m[0]`` is the keyword.
          3. Commands for the hard-coded user id 170378225 (presumably the
             bot owner -- confirm): list / search / review / commit / toggle
             the items of a stored answer dictionary.
          4. Teaching (a message containing the "say" line), un-teaching
             (restricted to the same user id), or normal answering, which
             covers the bad-word filter, echo, news, help, stored-answer
             lookup and canned fallbacks.

        This is Python 2 code (``unicode``, ``dict.iteritems``, ``urllib2``).
        State shared between calls lives in the module globals ``h_id``,
        ``d``, ``ddd`` and ``q2``.

        :param msg: message dictionary; the keys read here are ``text``,
            ``from`` (``first_name``, ``id``) and ``chat`` (``type``).
        """
        normalizer = Normalizer()
        #if msg.has_key(u'document'):
            #self.sender.downloadFile(msg[u'document'][u'file_id'], file_path="~/dl")
        m = msg['text'].split(' ')  # message split on single spaces
        mr = msg['text']            # raw message text
        fn = msg['from']['first_name']
        chat_type = msg['chat']['type']
        user_id = msg['from']['id']
        r = ''                      # reply; stays '' until some branch fills it
        if m[0] == u'/start':
            r = u'سلام به تو که اسمتو گذاشتی ' + unicode(fn)
        elif m[0] == u'mojose':
            # The reply is the whole message object itself.
            r = msg
        # In a private chat the keyword is optional: prepend it and re-split.
        if chat_type == 'private' and mr[:3] != u'هوی':
            mr = u'هوی ' + mr 
            m = mr.split(' ')
            
        # ------------------------------------------------------------------
        # Commands available only to this user id.
        # NOTE(review): m[1] below raises IndexError when the text is a
        # single word (e.g. only the keyword); this is not inside a try.
        # ------------------------------------------------------------------
        if user_id == 170378225:
            #global ddd = {index, keyOf dd }
            global h_id
            global d
            global ddd
            global q2
            #get outputs from db
            if m[1] == u'g':
                #print 'g'
                try:
                    # First Hoy row whose stored text contains ': 0'.
                    q = Hoy.select(User, Hoy).join(Chat).join(User).where(Hoy.hoy.contains(': 0')).get()
                    h_id = q.id
                    q2 = User.select().join(Chat).join(Hoy).where(Hoy.id==h_id)
                    # The stored text is parsed back into a dict.
                    d = ast.literal_eval(q.hoy)
                    # o: one "index : key : value" line per dict item.
                    o = ''
                    i = 0
                    d_iter = d.iteritems()
                    for (k, v) in (d_iter):
                        o += str(i)+' : '+k+' : '+str(v)+'\n'
                        i += 1
                    # ddd: index -> key, used later to toggle an item by number.
                    i = 0
                    d_k = d.keys()
                    dd = {}
                    for k in d_k:
                        dd[i] = k
                        i += 1
                    ddd = dd
                    # inputs: the user strings linked to this Hoy row.
                    inputs = ''
                    for i in q2:
                        inputs += i.user + '\n'
                    r = inputs+'\n-----------\n'+o
                    
                except:
                    r = 'چیزی برای تأیید نیست!'
            elif mr[4:6] == u'g\n':
                # 'g' followed by a newline: the next line is the exact user
                # string whose answers should be listed.
                #print 'g2'
                mrc = mr[4:]
                mc = mrc.split('\n')
                user_input = mc[1]
                try:
                    q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get()
                    h_id = q.id
                    q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id)
                    d = ast.literal_eval(q.hoy)
                    # Same listing / index map as in the 'g' branch above.
                    o = ''
                    i = 0
                    d_iter = d.iteritems()
                    for (k, v) in (d_iter):
                        o += str(i)+' : '+k+' : '+str(v)+'\n'
                        i += 1
                    i = 0
                    d_k = d.keys()
                    dd = {}
                    for k in d_k:
                        dd[i] = k
                        i += 1
                    ddd = dd
                    inputs = ''
                    for i in q2:
                        inputs += i.user + '\n'
                    r = inputs+'\n-----------\n'+o
                except:
                    r = 'نبود که!'
            elif mr[4:7] == 'gg\n':
                # 'gg' followed by a newline: approximate search for the
                # user string, then the same listing as above.
                #print 'gg'
                # Strip sentence punctuation and glue the verbal prefix to
                # the following word before normalizing.
                mrr = mr[7:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می‌', u'می')
                mrr = normalizer.normalize(mrr)
                #print 'normalized user input:', mrr
                mm = mrr.split(' ')
                # Build an alternation of all words (plus a variant with a
                # zero-width non-joiner after the prefix).
                # NOTE(review): w[2] raises IndexError when w is exactly the
                # two-character prefix; this loop is outside any try.
                rgx = u''
                for w in mm:
                    rgx += w+'|'
                    if u'می' == w[:2] and u'‌' != w[2] and u' ' != w[2]:
                        rgx += u'می‌'+w[2:]+u'|'
                # Fewer than three words: every position is mandatory;
                # otherwise every position is optional.
                if len(mm) < 3:
                    rgx = u'(' + rgx[:-1] + u') '
                else:
                    rgx = u'(' + rgx[:-1] + u')? '
                # Repeat the group once per word and drop the trailing space.
                rgx = rgx * len(mm)
                rgx = rgx[:-1]
                #print 'regex:', rgx
                try:
                    q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10)
                    #print 'records founded (max 10):', len(q)
                    if len(q) == 0:
                        #try to fuzzy string and rematch
                        #print 'not found!'
                        # Bare raise: used to jump to the except clause below.
                        raise
    
                    else:
                        n = 0
                        #rd = {n: ratio}
                        rd = {}
                        # Keep candidates whose fuzz.ratio is at least 50.
                        while n < len(q):
                            us = q[n].user.user
                            #print 'string founded: ', us
                            ratio = fuzz.ratio(us, mrr)
                            #print ratio
                            if ratio >= 50:
                                rd[n] = ratio
                            n += 1
                        #print rd
                        # Try candidates from best ratio downwards until one
                        # has a parsable dict containing a value of 1.  When
                        # rd runs empty, max() raises and the outer except
                        # ends the search.
                        ho = ''
                        while len(ho) == 0:
                            maxn = max(rd.values())
                            n = rd.keys()[rd.values().index(maxn)]
                            hoo = q[n].hoy.hoy
                            #print 'founded a dict for', n
                            try:
                                ho = ast.literal_eval(hoo)
                                #print 'a valid dict:', ho
                                user_input = q[n].user.user
                                if 1 not in ho.values():
                                    #print 'this dict haven\'t any valid item'
                                    raise
                            except:
                                #print 'deleting', rd[n]
                                del rd[n]
                                #print 'deleted!'
                                ho = ''
                                user_input = ''
                except:
                    #print 'eee!'
                    pass
                # NOTE(review): user_input may be unbound here if the search
                # above failed early; the resulting error is swallowed by the
                # bare except below and produces the "not found" reply.
                try:
                    q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get()
                    h_id = q.id
                    q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id)
                    d = ast.literal_eval(q.hoy)
                    # Same listing / index map as in the 'g' branch above.
                    o = ''
                    i = 0
                    d_iter = d.iteritems()
                    for (k, v) in (d_iter):
                        o += str(i)+' : '+k+' : '+str(v)+'\n'
                        i += 1
                    i = 0
                    d_k = d.keys()
                    dd = {}
                    for k in d_k:
                        dd[i] = k
                        i += 1
                    ddd = dd
                    inputs = ''
                    for i in q2:
                        inputs += i.user + '\n'
                    r = inputs+'\n-----------\n'+o
                except:
                    r = 'نبود که!'
                
            #review items
            elif m[1] == u'r':
                # Re-list the dict currently held in the globals d / q2.
                o = ''
                i = 0
                d_iter = d.iteritems()
                for (k, v) in (d_iter):
                    o += str(i)+' : '+k+' : '+str(v)+'\n'
                    i += 1
                i = 0
                d_k = d.keys()
                dd = {}
                for k in d_k:
                    dd[i] = k
                    i += 1
                ddd = dd
                inputs = ''
                for i in q2:
                    inputs += i.user + '\n'
                r = inputs+'\n-----------\n'+o 
            #commit changes
            elif m[1] == u'c':
                # Drop every item still valued 0, store the rest, then reset
                # the shared state.
                d_i = d.items()
                for k, v in d_i:
                    if v == 0:
                        del d[k]
                Hoy.update(hoy=d).where(Hoy.id == h_id).execute()
                d = {}
                ddd = {}
                inputs = ''
                r = 'تغییرات ذخیره شد!'
            #change state of an item
            elif len(m) == 2:
                # "<keyword> <number>": flip item number i between 0 and 1.
                # Anything that is not a valid index is silently ignored.
                try:
                    i = int(m[1])
                    if d[ddd[i]] == 0:
                        d[ddd[i]] = 1
                    else:
                        d[ddd[i]] = 0
                    r = ddd[i] + ' : ' + str(d[ddd[i]])
                except:
                    pass
            #if m[1] == 'grupoj':
                
            
        
        #TODO merge same outputs
        # ------------------------------------------------------------------
        # Teaching: lines before the "say" line are user inputs, lines after
        # it are the answers to store (each with value 0).
        # ------------------------------------------------------------------
        if '\n' in mr and u'\nبگو\n' in mr and r == '':
            mrc = normalizer.normalize(mr[4:])
            mc = mrc.split('\n')
            say_index = mc.index(u'بگو')
            user_inputs = mc[:say_index]
            hoy_outputs = mc[say_index+1:]
            hoy_outputs = {k:0 for k in hoy_outputs}
            hoy_outputs_old = {}
            # Look for an existing Hoy row for any of the inputs; inputs that
            # are already known are removed from user_inputs.
            # NOTE(review): user_inputs is modified while being iterated, so
            # the element following a removed one is skipped.
            for user_input in user_inputs:
                try:
                    H = (Hoy.select().join(Chat).join(User).where(User.user==user_input))
                    hoy_outputs_old = H[0].hoy
                    h_id = H[0].id
                    hoy_outputs_old = ast.literal_eval(hoy_outputs_old)
                    del user_inputs[user_inputs.index(user_input)]
                except:
                    pass
            if hoy_outputs_old == {}:
                # Nothing stored yet: create a new Hoy row.
                h = Hoy.create(hoy=hoy_outputs)
                r = u'پاسخ‌های شما در صف بررسی قرار گرفت. تا ارباب چی بگن!'
            else:
                try:
                    # Merge: values already stored win over the new zeros.
                    hoy_outputs.update(hoy_outputs_old)
                    update_query = Hoy.update(hoy=hoy_outputs).where(Hoy.id==h_id)
                    update_query.execute()
                    h = Hoy.get(Hoy.id==h_id)
                    r = u'پاسخ‌های شما نیز در صف بررسی قرار گرفت. تا ارباب چی بگن!'
                except Exception as e:
                    pass
                    #print e
            # Link every remaining (newly created) input to the Hoy row.
            try:
                for user_input in user_inputs:
                    u, created = User.get_or_create(user=user_input)
                    if created:
                        Chat.create(user=u, hoy=h)
            except Exception as e:
                pass
                #print e
        
        # Un-teaching, restricted to the same hard-coded user id: delete the
        # User row matching the first line of the message.
        elif '\n' in mr and u'\nنفهم' in mr and r == '' and user_id == 170378225:
            mrc = mr[4:]
            mc = mrc.split('\n')
            say_index = mc.index(u'نفهم')
            user_input = mc[:say_index]
            try:
                dq = User.delete().where(User.user==user_input[0])
                dq.execute()
                r = u'اطاعت! دیگر به چنین چیزی پاسخ نمی‌دهم.'
                #TODO delete u_id that not exist in User, from Chat
            except:
                r = u'چنین چیزی وجود ندارد!'
                
                
                
                
        # ------------------------------------------------------------------
        # Normal answering: the message starts with the bot keyword.
        # ------------------------------------------------------------------
        elif m[0] == u'هوی':
            # Bad-word filter: a substring regex plus a few whole-word checks.
            if re.search(u'تخم|کیر|کسخل|کون|کون|الاغ|الاق|جنده|گای|پستون|ممه|گوز|شاش|جیش|قبحه|جلق|جق|سگ|جاکش|گائ|گاتو|کیون|لاشی|گامو|فاک|ساک|کُس|کوس|کوص|کص|سکس|پورن|الکسیس|گاشو', mr) \
            or re.search(u'(^| )رید(.|$)', mr) or u'خرم' in m or u'خری' in m or u'خره' in m or u'گا' in m or u'شق' in m or u'منی' in m or re.search(u'(^| )حشری(.|$)', mr):
                r = choice([u'بی‌ادب :|', u'بی‌تربیت :|', u'بی‌شخصیت :|',u'عفت کلام داشته باش یه ذره :|', u'دهنتو آب بکش :|'])
            #elif m[1] == u'سلام' or m[1] == u'درود':
                #r = choice([u'سلام', u'علیک سلام'])
            elif len(m) >= 3 and m[1] == u'بگو':
                # Echo: reply with the normalized text after the first 8
                # characters of the message.
                r = normalizer.normalize(mr[8:])
            elif len(m) == 3:
                m2 = m[1]+' '+m[2]
                if m2 == u'چه خبر؟':
                    # News: fetch the RSS feed and reply with three titles
                    # rewritten as HTML links.
                    response = urllib2.urlopen('http://www.farsnews.com/RSS')
                    rss = response.read()
                    soup = BeautifulSoup.BeautifulSoup(rss)
                    all_title = soup.findAll('title')
                    def get_link(nth):
                        # First farsnews URL with a numeric path found in the
                        # nth <item> of the feed.
                        item = soup.findAll('item')[nth]
                        link = re.search(r'http://www.farsnews.com/(\d+)',unicode(item)).group(0)
                        return link
                    # NOTE(review): the first replace targets '<title>' while
                    # the other two target '<title' -- confirm this is intended.
                    r = unicode(all_title[2]).replace('<title>', '<a href="%s">'%get_link(0), 2).replace('</title>', '</a>') + '\n\n' + \
                            unicode(all_title[3]).replace('<title', '<a href="%s"'%get_link(1), 2).replace('</title>', '</a>') + '\n\n' + \
                         unicode(all_title[4]).replace('<title', '<a href="%s"'%get_link(2), 2).replace('</title>', '</a>')
            elif len(m) == 2:
                # Help text (a single backslash-continued string literal).
                if m[1] == u'راهنما':
                    r = u'• به این شکل هوی را آموزش دهید:\n\
\n\
سلام\n\
درود\n\
بگو\n\
علیک سلام\n\
سلام حاجی\n\
\n\
!> دقت کنید که در یک پیام و در خط‌های جدا باشد.\n\
\n\
!> اگر در گروه آموزشش می‌دهید، ابتدا هوی بنویسید و سپس مثل بالا خطوط را وارد کنید. این دو شکل قابل قبول است:\n\
\n\
هوی سلام\n\
درود\n\
بگو\n\
علیک سلام\n\
سلام حاجی\n\
---------\n\
هوی\n\
سلام\n\
درود\n\
بگو\n\
علیک سلام\n\
سلام حاجی\n\
\n\
• آموخته‌ها پس از تأیید به نمایش در می‌آیند.\n\
\n\
!> آموخته‌هایی که به اشخاص مربوط است و جنبهٔ عمومی ندارد، تأیید نمی‌شود.\n\
!> آموخته‌های شامل حرف بد، توهین و… تأیید نمی‌شود.\n\
!> آموخته‌های دارای اشتباه نوشتاری تأیید نمی‌شود.\n\
\n\
• اگر مثلاً «سلام» برای هوی تعریف شده باشد، می‌تواند این‌گونه از پاسخ‌های «سلام» برای «هلو» هم استفاده کند:\n\
\n\
سلام\n\
هلو\n\
بگو\n\
علیک\n\
های\n\
سلام عزیز\n\
\n\
• اگر پیشنهادی دارید، به @HSN6789 پیام بدهید.'
            
            # No special case matched: look for a stored answer.
            if r == '':
                # Same punctuation stripping / prefix gluing as the 'gg'
                # search, but starting after the 4-character keyword prefix.
                mrr = mr[4:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می‌', u'می')
                mrr = normalizer.normalize(mrr)
                #print 'normalized user input:', mrr
                mm = mrr.split(' ')
                # NOTE(review): w[2] raises IndexError when w is exactly the
                # two-character prefix; this loop is outside the try below.
                rgx = u''
                for w in mm:
                    rgx += w+'|'
                    if u'می' == w[:2] and u'‌' != w[2] and u' ' != w[2]:
                        rgx += u'می‌'+w[2:]+u'|'
                if len(mm) < 3:
                    rgx = u'(' + rgx[:-1] + u') '
                else:
                    rgx = u'(' + rgx[:-1] + u')? '
                rgx = rgx * len(mm)
                rgx = rgx[:-1]
                #print 'regex:', rgx
                try:
                    q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10)
                    #print 'records founded (max 10):', len(q)
                    if len(q) == 0:
                        #try to fuzzy string and rematch
                        #print 'not found!'
                        # Bare raise: jumps to the fallback replies below.
                        raise
    
                    else:
                        n = 0
                        #rd = {n: ratio}
                        rd = {}
                        # Keep candidates whose fuzz.ratio is at least 50.
                        while n < len(q):
                            us = q[n].user.user
                            #print 'string founded: ', us
                            ratio = fuzz.ratio(us, mrr)
                            #print ratio
                            if ratio >= 50:
                                rd[n] = ratio
                            n += 1
                        #print rd
                        # Best candidate first; candidates without a parsable
                        # dict holding a value of 1 are discarded.  An empty
                        # rd makes max() raise, which lands in the fallback.
                        ho = ''
                        while len(ho) == 0:
                            maxn = max(rd.values())
                            n = rd.keys()[rd.values().index(maxn)]
                            hoo = q[n].hoy.hoy
                            #print 'founded a dict for', n
                            try:
                                ho = ast.literal_eval(hoo)
                                #print 'a valid dict:', ho
                                if 1 not in ho.values():
                                    #print 'this dict haven\'t any valid item'
                                    raise
                            except:
                                #print 'deleting', rd[n]
                                del rd[n]
                                #print 'deleted!'
                                ho = ''
                        try:
                            # Pick a random answer among those valued 1.
                            outputs = []
                            for key in ho.keys():
                                if ho[key]==1:
                                    outputs.append(key)
                            r = normalizer.normalize(choice(outputs))
                            # Re-insert the zero-width non-joiner after the
                            # verbal prefix of the last word when it is missing.
                            w = r.split(' ')
                            if u'می' == w[-1][:2] and u'‌' != w[-1][2] and u' ' != w[-1][2]:
                                w[-1] = u'می‌'+w[-1][2:]
                            r = ' '.join(w)
                        except:
                            r = ''
                    if r == '':
                        raise
                except Exception as e:
                    # Fallback replies, chosen from the shape of the message:
                    # an opinion question, a trailing question mark, a
                    # trailing exclamation mark, a trailing full stop, or
                    # anything else.
                    if re.search(u'(نظرت|نظر تو) (در مورد|درباره|دربارهٔ|درباره ی|درباره‌ی|راجع به|راجب) .* (چیست|چیه)', mr):
                        r = choice([u'در مورد همه چی باید نظر بدم؟!', u'نظر خاصی ندارم.', u'در این زمینه صاحب نظر نیستم.'])
                    elif re.search(u'؟$', mr):
                        r = choice([u'چرا می‌پرسی؟', u'نپرس!', u'نمی‌دونم.'])
                    elif re.search(u'!$', mr):
                        r = choice([u'عجب!', u'چه جالب!'])
                    elif re.search(u'\.$', mr):
                        r = choice([u'این که پایان جمله‌ت نقطه گذاشتی خیلی عالیه! ولی معنی جمله‌ت رو نمی‌فهمم. یادم بده.'])
                    else:   
                        r = u'نمی‌فهمم چی می‌گی. بیا خصوصی یادم بده!'
                    #print 'erorr:', e
                    #r = e
        # Send the reply, if any branch produced one.
        if len(r) > 0:            
            self.sender.sendMessage(r,parse_mode='HTML')
Пример #39
0
        col_sums[col_sums.nonzero()]).sum() / len(col_sums[col_sums.nonzero()])
        row_sums = confusion_matrix.sum(1)
  #  recall = (
   #     confusion_matrix.diagonal()[row_sums.nonzero()] /
  #      row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])

    #print labels
    #print confusion_matrix
    return precision


if __name__ == '__main__':
    # Load the Hamshahri documents, normalize and stem their text, and count
    # how many documents fall into each category.
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter[doc['cat']] += 1
        docs.append(doc)

    # Parenthesized form prints the same on Python 2 and is valid Python 3.
    print(counter)

    # Every stemmed word of every document, in document order.
    all_words = [word for doc in docs for word in doc['words']]

    # Word frequencies over the whole corpus.
    dist = nltk.FreqDist(all_words)

    word_features = dimension_reduction(all_words, dist)
Пример #40
0
def normalizefarsi(bot, update):
    """Reply to the chat with the normalized form of the received text.

    The text of ``update.message`` is passed through a fresh ``Normalizer``
    and the result is sent back to ``update.message.chat_id``.
    """
    normalized_text = Normalizer().normalize(update.message.text)
    bot.sendMessage(update.message.chat_id, text=normalized_text)
Пример #41
0
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser
from InformationExtractor import InformationExtractor
from progress.bar import Bar


hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
	texts.append(normalizer.normalize(text))
	if len(texts) <= 1000: continue

	sentences = []
	for text in texts:
		for sentence in sent_tokenize(text):
			words = word_tokenize(sentence)
			if len(words) >= 3:
				sentences.append(words)
	texts = []

	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	for sentence in parsed:
		# print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)