Example #1
    def score(self, sentences):
        # Predict a sentiment score for the given text
        pos, neg, neu = 0, 0, 0
        stemmer = Stemmer()
        classifier = self.__get_model()
        normalizer = Normalizer()

        sentences = sent_tokenize(sentences)
        total_words = 0

        for sentence in sentences:
            sentence = normalizer.normalize(sentence)
            words = word_tokenize(sentence)
            total_words += len(words)

            for word in words:
                word = stemmer.stem(word)
                class_result = classifier.classify(self.__word_feats(word))
                if class_result == 'neg':
                    neg += 1
                elif class_result == 'pos':
                    pos += 1
                elif class_result == 'neu':
                    neu += 1

        # Normalize by the total token count, not just the last sentence's length
        positive_sentiment = float(pos) / total_words
        neutral_sentiment = float(neu) / total_words
        negative_sentiment = -float(neg) / total_words

        total_sentiment = (positive_sentiment + negative_sentiment) / 2
        return total_sentiment
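The snippet above leans on hazm's core primitives. A minimal, self-contained sketch of that normalize → sent_tokenize → word_tokenize → stem pipeline (the sample sentence is only illustrative):

from hazm import Normalizer, Stemmer, sent_tokenize, word_tokenize

normalizer = Normalizer()
stemmer = Stemmer()

text = normalizer.normalize('کتاب‌ها را خواندم. هوا خوب است.')
for sentence in sent_tokenize(text):
    # tokenize each sentence and stem every token
    stems = [stemmer.stem(word) for word in word_tokenize(sentence)]
    print(stems)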
Example #2
def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sentences = hazm.sent_tokenize(dat)

    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()
    words = []

    for s in sentences:
        tagged = tagger.tag(hazm.word_tokenize(s))
        # drop stop words before lemmatizing and stemming
        kept = [token for token in tagged if token[0] not in stop_words]

        for token in kept:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)

    return words
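stem_data relies on module-level tagger and stop_words objects that are not shown here. One possible setup, where the model path and the stop-word set are placeholders rather than values from the original project:

import hazm

tagger = hazm.POSTagger(model='resources/postagger.model')  # placeholder model path
stop_words = {'و', 'در', 'به', 'از', 'که'}  # placeholder stop-word set

print(stem_data('کتاب‌ها را به کتابخانه بردند.'))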
Example #3
 def __iter__(self):
     wiki = WikipediaReader(fawiki_dump=self.dump_file)
     for doc in wiki.docs():
         sentences = sent_tokenize(doc['text'])
         for sentence in sentences:
             # You should apply any preprocess before yield
             yield tokenizer.tokenize(sentence)
Example #4
 def stremme(val):
     Log.logger.info('Data stemmed by hazm package')
     # words = [[stemmer.stem(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
     words = [[ps.run(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
     words = words[0]
     val = ' '.join(words)
     return val
Example #5
    def _get_summarize(self, num_sentences):
        words = [
            word for word in self.base_words
            if word not in stopwords.words('persian')
        ]
        word_frequencies = FreqDist(words)

        # FreqDist.most_common() returns (word, count) pairs sorted by frequency
        most_frequent_words = [
            pair[0] for pair in word_frequencies.most_common(100)
        ]

        actual_sentences = sent_tokenize(self.input)
        output_sentences = []

        for word in most_frequent_words:
            for i in range(0, len(self.working_sentences)):
                if (word in self.working_sentences[i]
                        and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
                if len(output_sentences) >= num_sentences:
                    break

            if len(output_sentences) >= num_sentences:
                break

        return self._reorder_sentences(output_sentences)
Example #6
    def extract_metadata(self, tweet):
        important_words = []
        syms = []
        hashtags = []
        content_len = 0

        content = self.normalizer.normalize(tweet['content'])
        if 'های وب' in content: syms.append('های_وب')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            sentence = sentence.translate(str.maketrans('', '', self.punctuations))

            words = word_tokenize(sentence)
            content_len += len(words)
            sent_syms, sent_hashs = self.get_symbols(words)
            syms += sent_syms
            hashtags += sent_hashs
            tags = self.tagger.tag(words)
            verbs = [word for (word, role) in tags if role == 'V']

            filtered_words = ([word.replace('#', '')
                               for word in words if word.replace('#', '') not in self.stop_words
                               and word.replace('#', '') not in verbs
                               and set(word.replace('#', '')).intersection(self.persian_alphabet)
                               and len(word.replace('#', '')) > 1])
            important_words += filtered_words
        syms = list(set(syms))
        hashtags = list(set(hashtags))
        bigrams = self.get_ngrams(important_words, 2)
        trigrams = self.get_ngrams(important_words, 3)
        candidate_words = hashtags + syms + important_words + bigrams + trigrams
        keywords = self.get_keywords(candidate_words, content_len)
        return keywords, syms, hashtags
Example #7
def evaluate_summarizer(clf, dataset, used_features, remove_stopwords=False):

    rouge = Rouge()

    def zero_scores():
        # fresh score dict each time, so totals and best scores never alias each other
        return {
            'rouge-1': {'p': 0, 'f': 0, 'r': 0},
            'rouge-2': {'p': 0, 'f': 0, 'r': 0},
            'rouge-l': {'p': 0, 'f': 0, 'r': 0}
            }

    total_scores = zero_scores()
    total_summaries = 0
    for key in dataset:
        total_summaries += 1
        text = dataset[key]['text']
        gold_summaries = dataset[key]['summaries']
        best_score = zero_scores()
        for ref_key in gold_summaries:
            ref = gold_summaries[ref_key]
            ref_len = len(hazm.sent_tokenize(ref))
            if remove_stopwords:
                ref = farsi.remove_stop_words_and_puncs(ref)
            summary = summ(text, clf, key[4:6], used_features, ref_len)
            summary = " ".join(summary)
            if remove_stopwords:
                summary = farsi.remove_stop_words_and_puncs(summary)
            if len(summary) == 0:
                continue
            try:
                scores = rouge.get_scores(ref, summary)[0]
            except Exception:
                # skip reference/summary pairs that rouge cannot score
                print(ref)
                print(summary)
                continue
            best_score = best_rouge_f(best_score, scores)

        for test_type in best_score:
            for param in best_score[test_type]:
                total_scores[test_type][param] += best_score[test_type][param]

    avg_scores = zero_scores()
    for test_type in total_scores:
        for param in total_scores[test_type]:
            avg_scores[test_type][param] = total_scores[test_type][param] / total_summaries
    return avg_scores
Example #8
 def s_normal(val):
     words = []
     for sentence in sent_tokenize(val):
         for word in word_tokenize(sentence):
             end = word.find('#')
             if end == -1:
                 end = len(word)
             words.append(word[:end])
     val = ' '.join(words)
     return val
Example #9
def tokenize(paragraph, wanted_list):
    normal = Normalizer(remove_extra_spaces=True,
                        punctuation_spacing=True,
                        persian_style=False,
                        persian_numbers=False,
                        remove_diacritics=False,
                        affix_spacing=False,
                        token_based=False)
    for sentence in sent_tokenize(normal.normalize(paragraph)):
        wanted_list.append(sentence)
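A quick usage sketch for the helper above; the paragraph is an arbitrary example:

wanted = []
tokenize('هوا خوب است. باران می‌بارد.', wanted)
print(wanted)  # one list entry per detected sentence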
Example #10
        def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
            """Normalize, tokenize, and recognize part of speech"""
            sentences_tokens = []
            sentences = hazm.sent_tokenize(normalizer.normalize(text))
            for sentence in sentences:
                sentence_tokens = []
                for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                    sentence_tokens.append(Token(text=word, pos=pos))

                sentences_tokens.append(sentence_tokens)

            return sentences_tokens
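do_tokenize closes over a normalizer, a tagger, and a Token type defined elsewhere in that project. A hypothetical minimal setup could look like this (Token is a stand-in for the project's own class, and the model path is a placeholder):

from dataclasses import dataclass

import hazm


@dataclass
class Token:
    # stand-in for the project's Token type, used as Token(text=..., pos=...)
    text: str
    pos: str


normalizer = hazm.Normalizer()
tagger = hazm.POSTagger(model='resources/postagger.model')  # placeholder path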
Example #11
def hazmtoalpheios(word,uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    analyses = []
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        # tag() expects a list of tokens, not a bare string
        wordtagged = tagger.tag([item])
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {} 
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
Example #12
	def texts(self, categories={'Politics'}, limit=None):
		docs = self.hamshahri.docs()
		print('start reading corpus...')
		count = 0
		texts = []
		for doc in docs:
			if limit is not None and count == limit:
				break
			if len(categories.intersection(set(doc["categories_en"]))) > 0:
				count += 1
				for sent in sent_tokenize(doc['text']):
					if len(sent) <= 1:
						continue
					texts.append([word for word in word_tokenize(sent) if word not in self.stopwords and len(word) > 1])
		return texts
Example #13
    def readTrainTestFiles(self):
        """This function loads all train and test datas into list and preprocess them"""
        self.firstClassTrainList = []
        self.secondClassTrainList = []
        self.firstClassTestList = []
        self.secondClassTestList = []

        for fileName in self.firstClassTrainFiles:
            self.firstClassTrainList.append(
                self.preProcessing(open(fileName, 'r').read()))

        for fileName in self.secondClassTrainFiles:
            self.secondClassTrainList.append(
                self.preProcessing(open(fileName, 'r').read()))

        for fileName in self.firstClassTestFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for sentence in sentences:
                self.firstClassTestList.append(self.preProcessing(sentence))

        for fileName in self.secondClassTestFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for sentence in sentences:
                self.secondClassTestList.append(self.preProcessing(sentence))
Example #14
def calculate_embeding(datatype):
    c = 0
    X_Word_embeding = []
    X_LSTM_1 = []
    X_LSTM_2 = []
    X_avg = []
    X_texts = []
    hd5_capacity = 1000
    with open('testdata/{0}/cleaned_captions.txt'.format(datatype)) as f:
        # with open('testdata/cluster_2.txt') as f:
        # with open('testdata/cleaned_captions.txt') as f:
        while True:
            sample = f.readline()
            if not sample:
                # readline() returns '' once the file is exhausted
                break
            c = c + 1
            print(c)

            sents = sent_tokenize(sample)
            sents_tokens = [word_tokenize(sent) for sent in sents]
            try:
                word_encoder = e.sents2elmo(sents_tokens, 0)
                LSTM_hidden_1 = e.sents2elmo(sents_tokens, 1)
                LSTM_hidden_2 = e.sents2elmo(sents_tokens, 2)
                average_layers = e.sents2elmo(sents_tokens, -1)

                X_Word_embeding.append(
                    _calculate_caption_embeding(word_encoder))
                X_LSTM_1.append(_calculate_caption_embeding(LSTM_hidden_1))
                X_LSTM_2.append(_calculate_caption_embeding(LSTM_hidden_2))
                X_avg.append(_calculate_caption_embeding(average_layers))
                # X_texts.append(sample)

                if c % hd5_capacity == 0:

                    postfix = str(int(c / hd5_capacity))

                    _save_embeddings_parts(X_Word_embeding, X_LSTM_1, X_LSTM_2,
                                           X_avg, postfix)
                    X_Word_embeding = []
                    X_LSTM_1 = []
                    X_LSTM_2 = []
                    X_avg = []

            except ZeroDivisionError:
                print(sents_tokens)
                continue
Example #15
def write_to_string(input_text, label6, label41):
    #output_string6 = ""
    #output_string41 = ""
    output_string = ""
    wo_tag_text = re.sub('<[^<]+?>', '', input_text)
    wo_tag_text = re.sub('&nbsp;', '', wo_tag_text)
    sent_list = hazm.sent_tokenize(wo_tag_text)
    for sent in sent_list:
        word_list = hazm.word_tokenize(sent)
        word_tokenize_sent = ""
        for word in word_list:
            word_tokenize_sent += word + ' '
        output_string += word_tokenize_sent + "\t"
    output_string6 = output_string + "__label__" + label6 + '\n'
    output_string41 = output_string + "__label__" + label41 + '\n'
    return output_string6, output_string41
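A hypothetical call of the function above; the HTML fragment and both labels are placeholders:

line6, line41 = write_to_string('<p>این یک خبر ورزشی است.</p>', 'sport', 'sport_football')
print(line6, line41)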
Example #16
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]

    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(
            skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)

            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, "pos": tag})

            tags_collection.insert({
                "id": doc["id"],
                "categories_fa": doc["categories_fa"],
                "text": doc["text"],
                "words": words
            })

            done += 1
            #if done % 100 == 0:
            end = time.time()
            print('Worker' + str(identifier) + ': Done ' + str(done) +
                  ' out of ' + str(count) + ' in ' +
                  ("%.2f" % (end - start)) + ' sec ~ ' +
                  ("%.2f" % (done / (end - start))) + '/sec')
            sys.stdout.flush()
Example #17
 def process(self, message: Message, **kwargs: Any) -> None:
     text = message.text
     for sentence_str in sent_tokenize(text):
         sentence = Sentence(sentence_str)
         tokens = word_tokenize(sentence_str)
         pos_tags = []
         if self.component_config.pos:
             pos_tags = self._pos_tagger.tag(tokens)
         for idx, token_str in enumerate(tokens):
             token = Token(text=token_str)
             if self.component_config.stemmer:
                 token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
             if self.component_config.lemmatizer:
                 token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(
                     token_str)
             if self.component_config.pos:
                 token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
             sentence.add_token(token)
         message.add_sentence(sentence)
Example #18
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer

    text = hz.sent_tokenize(normalizer.normalize(text))

    tagged = [tagger.tag(hz.word_tokenize(sent)) for sent in text]

    bag_of_words = defaultdict(int)
    for sentence in tagged:
        words = [
            lemmatizer.lemmatize(w[0]).split('#')[0]
            if w[1] == 'V' else stemmer.stem(str(w[0])) for w in sentence
        ]
        for w in words:
            bag_of_words[w] += 1

    return bag_of_words
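bow() expects module-level normalizer, tagger, stemmer and lemmatizer objects, with hz as an alias for hazm. One possible setup (the tagger model path is a placeholder):

from collections import defaultdict

import hazm as hz

normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')  # placeholder model path
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

print(dict(bow('او به مدرسه می‌رود و کتاب می‌خواند.')))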
Example #19
def hazmtoalpheiosfile(data,uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")    
    oaannotation = etree.SubElement(root,'{http://www.w3.org/ns/oa#}Annotation',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':'http://services.projectbamboo.org/morphology'+uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody',)
    oahastarget = etree.SubElement(oaannotation,'{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(oahastarget,'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    ispartof = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}isPartOf',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    source = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}source',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource':uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {'{http://www.w3.org/XML/1998/namespace}lang':'eng'})
    title.text = "Morphology of " + uri
    wordslist = etree.SubElement("words")
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        if words:
            words = words.append(word_tokenize(sentence))
        else:
            words = word_tokenize(sentence)
    for item in words:
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        lemmatizer = Lemmatizer()
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            worldleam, garbage = wordlema.split("#")
        tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))
        wordtagged = tagger.tag(item)
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist,'word')
        form = etree.SubElement(word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry,'inlf')
        term = etree.SubElement(infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
Example #21
    def text_to_tokens(
        self, text: str
    ) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
        """
        Process text into words and sentence tokens using hazm.

        Returns: (original_words, sentence_tokens) for each sentence
        """

        try:
            import hazm
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")

            # Fall back to parent implementation
            yield from super().text_to_tokens(text)
            return

        # Load normalizer
        if not hasattr(self, "normalizer"):
            normalizer = hazm.Normalizer()
            setattr(self, "normalizer", normalizer)

        # Load tagger
        if not hasattr(self, "tagger"):
            # Load part of speech tagger
            model_path = self.lang_dir / "postagger.model"
            tagger = hazm.POSTagger(model=str(model_path))
            setattr(self, "tagger", tagger)

        sentences = hazm.sent_tokenize(self.normalizer.normalize(text))
        for sentence in sentences:
            original_words = []
            sentence_tokens = []
            for word, pos in self.tagger.tag(hazm.word_tokenize(sentence)):
                original_words.append(word)
                sentence_tokens.append(
                    Token(text=word,
                          features={TokenFeatures.PART_OF_SPEECH: pos}))

            yield original_words, sentence_tokens
Example #22
def tok(dataTok):
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False,
                              replace_links=True,
                              replace_IDs=True,
                              replace_numbers=True,
                              replace_hashtags=True)
    s = time.time()
    ij = 0
    #dataTok.apply (lambda x: dataTok1.append(sent_tokenize(x)) )

    for row in dataTok:
        _sents = sent_tokenize(row)
        _sents = stop_word(_sents)
        for _sent in _sents:
            _temp = _sent.replace(".", "").replace(",", "").replace(
                "،", "").replace("؛", "").strip()
            _wrds = normalizer.normalize(_temp)
            dataTok1.append(tokenizer.tokenize(_wrds))

    print("Data: ", dataTok1.__len__())
    e = time.time()
    print("Tokenize Done, Time: ", e - s, " !\n")
Example #23
import re
import sys

import hazm as Hazm
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])

normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()
remove_non_persian_regex = re.compile('[^آ-ی]')
raw_text = remove_non_persian_regex.sub(
    ' ', raw_text)  #We replace all non persian texts
normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)

result_tokens = list()
less_accurate_tokens = list()


def add_to_tokens_if_not_exists(parsed_token):
    if parsed_token in result_tokens:
        return
    # Part four: choose the token based on frequency, or search and score by frequency
    freq = zipf_frequency(parsed_token, 'fa')
    if freq < 6:
        result_tokens.append(parsed_token)
Example #24
    likes = int(unidecode(likes.replace(',', '')))

    disLike = int(unidecode(disLike.replace(',', '')))

    temp = {
        'comment': comment,
        'prod id': name,
        'price': price,
        'like': likes,
        'dislike': disLike,
        'date': date,
        'person': person,
        'buyer': buyer
    }

    for sent in hazm.sent_tokenize(CleanPersianText(comment)):
        comment = hazm.word_tokenize(sent)

        while len(comment) >= maxLenSent:
            temp['comment'] = ' '.join(comment[:maxLenSent])
            comment = comment[(maxLenSent - 2):]

            out = out.append(temp, ignore_index=True)
        if len(comment) > 3:
            temp['comment'] = ' '.join(comment)
            out = out.append(temp, ignore_index=True)

out.to_excel('cleaned1.xlsx')
nameDF.to_excel('Prod_spec1.xlsx')
Example #25
    def get_answer(self, question, tokens, labels):
        answer = {
            'type': ['4'],
            'city': [],
            'date': [],
            'time': [],
            'religious_time': [],
            'calendar_type': [],
            'event': [],
            'api_url': [''],
            'result': []
        }
        generated_sentence = ""
        is_time_asked = False
        for t in time_asked:
            if t in question:
                is_time_asked = True

        if is_time_asked:
            return self.time.get_answer(question, tokens, labels)

        date_list = []
        date_list_jalali = []
        exportdate = export_date(question, tokens, labels, True)
        events = []
        which_date_is_event = []
        for i, d in enumerate(exportdate):
            if d[0]:
                date_list.append(d[0])
            if (not d[1][0]) and (not d[1][1]) and (type(d[1][2]) != bool):
                events.append(d[1][2])
                which_date_is_event.append(i)

        d_n = len(date_list)
        today = datetime.datetime.today()
        no_date = False
        if d_n == 0:
            date_list = [today]
            d_n = 1
            no_date = True
        date_list = unique_without_sort(date_list)
        d_n = len(date_list)
        date_list_jalali = []
        for d in date_list:
            j = gregorian_to_jalali(d.year, d.month, d.day)
            date_list_jalali.append(format_jalali_date(j))

        answer["date"] = date_list_jalali
        event_list = events
        answer["event"] = list(event_list)
        self.bii = concatenate_bi(tokens, labels, "B_DAT", "I_DAT")

        if no_date:
            answer["result"] = date_list_jalali
            generated_sentence = "امروز، {} است".format(
                tr_single_date(date_list[0], force_date=True))
        else:
            if d_n == 1:
                asingle, generated_sentence = self.get_single_answer(
                    question, answer, date_list, events)
                if asingle != None:
                    answer = asingle
                else:
                    answer["result"] = date_list_jalali
                    trsd = tr_single_date(date_list[0], True)
                    if self.bii:
                        if date_list[0].date() >= today.date():
                            generated_sentence = "{}، {} میباشد".format(
                                " ".join(self.bii), trsd)
                        else:
                            generated_sentence = "{}، {} بوده است".format(
                                " ".join(self.bii), trsd)
                    else:
                        if date_list[0].date() >= today.date():
                            generated_sentence = "تاریخ داده شده {} است".format(
                                trsd)
                        else:
                            generated_sentence = "تاریخ داده شده {} بوده".format(
                                trsd)
            else:
                answer["result"] = []
                tokenize_questions = hazm.sent_tokenize(question)
                if len(tokenize_questions) == 1:
                    tokenize_questions = question.split(" و ")
                if d_n == len(tokenize_questions):
                    generated_sentence = ""
                    if d_n != len(events):
                        s = 0
                        for i, (d, tk) in enumerate(
                                zip(date_list, tokenize_questions)):
                            if i in which_date_is_event:
                                n_answer, n_generated_sentence = self.get_single_answer(
                                    tk, answer, [d],
                                    [events[which_date_is_event[s]]],
                                    self.bii[i]
                                    if len(self.bii) == d_n else None)
                                s += 1
                            else:
                                n_answer, n_generated_sentence = self.get_single_answer(
                                    tk, answer, [d], None, self.bii[i]
                                    if len(self.bii) == d_n else None)
                            if n_answer != None:
                                answer = n_answer
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
                            else:
                                n_answer, n_generated_sentence = self.get_single_answer(
                                    question, answer, [d], events, self.bii[i]
                                    if len(self.bii) == d_n else None)
                                if n_answer != None:
                                    answer = n_answer
                                    if generated_sentence:
                                        generated_sentence = generated_sentence + " و " + n_generated_sentence
                                    else:
                                        generated_sentence = n_generated_sentence
                                else:
                                    n_generated_sentence = "تاریخ داده شده {} میباشد".format(
                                        tr_single_date(d))

                                    j = gregorian_to_jalali(
                                        d.year, d.month, d.day)
                                    answer["result"].append(
                                        format_jalali_date(j))
                                    if generated_sentence:
                                        generated_sentence = generated_sentence + " و " + n_generated_sentence
                                    else:
                                        generated_sentence = n_generated_sentence
                    else:
                        for i in range(d_n):
                            n_answer, n_generated_sentence = self.get_single_answer(
                                tokenize_questions[i], answer, [date_list[i]],
                                [events[i]],
                                self.bii[i] if len(self.bii) == d_n else None)
                            if n_answer != None:
                                answer = n_answer
                                if generated_sentence:
                                    generated_sentence = generated_sentence + " و " + n_generated_sentence
                                else:
                                    generated_sentence = n_generated_sentence
                            else:
                                n_answer, n_generated_sentence = self.get_single_answer(
                                    question, answer, [date_list[i]],
                                    [events[i]], self.bii[i]
                                    if len(self.bii) == d_n else None)
                                if n_answer != None:
                                    answer = n_answer
                                    if generated_sentence:
                                        generated_sentence = generated_sentence + " و " + n_generated_sentence
                                    else:
                                        generated_sentence = n_generated_sentence
                                else:
                                    j = gregorian_to_jalali(
                                        date_list[i].year, date_list[i].month,
                                        date_list[i].day)
                                    answer["result"].append(
                                        format_jalali_date(j))
                                    n_generated_sentence = "تاریخ داده شده {} است".format(
                                        tr_single_date(date_list[i]))
                                    if generated_sentence:
                                        generated_sentence = generated_sentence + " و " + n_generated_sentence
                                    else:
                                        generated_sentence = n_generated_sentence

                else:
                    for d in date_list:
                        n_answer, n_generated_sentence = self.get_single_answer(
                            question, answer, [d], events,
                            self.bii[i] if len(self.bii) == d_n else None)
                        if n_answer != None:
                            answer = n_answer
                            if generated_sentence:
                                generated_sentence = generated_sentence + " و " + n_generated_sentence
                            else:
                                generated_sentence = n_generated_sentence
        return answer, cleaning(generated_sentence)
Example #26
    def convertVWDataFormat(self, min, max):
        firstCounter = 0
        secondCounter = 0
        firstClassTrainSentences = []
        secondClassTrainSentences = []
        firstClassTestSentences = []
        secondClassTestSentences = []

        fileTrain = open("Train.txt", "w")
        fileTest = open("Test.txt", "w")

        for fileName in self.firstClassTrainFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for s in sentences:
                firstClassTrainSentences.append(s)
        for fileName in self.secondClassTrainFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for s in sentences:
                secondClassTrainSentences.append(s)

        for fileName in self.firstClassTestFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for s in sentences:
                firstClassTestSentences.append(s)
        for fileName in self.secondClassTestFiles:
            sentences = hazm.sent_tokenize(open(fileName, 'r').read())
            for s in sentences:
                secondClassTestSentences.append(s)

        while firstCounter < len(
                firstClassTrainSentences) and secondCounter < len(
                    secondClassTrainSentences):
            firstClassTrainSentences[firstCounter] = self.preProcessingVW(
                firstClassTrainSentences[firstCounter])
            secondClassTrainSentences[secondCounter] = self.preProcessingVW(
                secondClassTrainSentences[secondCounter])
            # len() >= 0 was always true; only write non-empty sentences
            if len(firstClassTrainSentences[firstCounter]) > 0 and len(
                    secondClassTrainSentences[secondCounter]) > 0:
                fileTrain.write(
                    str(max) + " |" + firstClassTrainSentences[firstCounter] +
                    "\n")
                fileTrain.write(
                    str(min) + " |" +
                    secondClassTrainSentences[secondCounter] + "\n")
            firstCounter += 1
            secondCounter += 1

        firstCounter = 0
        secondCounter = 0

        while firstCounter < len(
                firstClassTestSentences) and secondCounter < len(
                    secondClassTestSentences):
            firstClassTestSentences[firstCounter] = self.preProcessingVW(
                firstClassTestSentences[firstCounter])
            secondClassTestSentences[secondCounter] = self.preProcessingVW(
                secondClassTestSentences[secondCounter])
            if len(firstClassTestSentences[firstCounter]) > 0 and len(
                    secondClassTestSentences[secondCounter]) > 0:
                fileTest.write(
                    str(max) + " |" + firstClassTestSentences[firstCounter] +
                    "\n")
                fileTest.write(
                    str(min) + " |" + secondClassTestSentences[secondCounter] +
                    "\n")
            firstCounter += 1
            secondCounter += 1
Example #27
def normalize_user_text(text):
    pe = PersianEditor()
    text = pe.cleanup(text)
    my_text = put_space_punc(text)
    textSplited = sent_tokenize(my_text)
    return textSplited
Example #28
def mallet(x):
    num_features_mallet = int(len(x.firstClassTrainDictionary) / 20)
    mallet_features = []

    first_class_sentences = []
    second_class_sentences = []

    for file_name in x.firstClassAllFiles:
        with open(file_name, "r") as f:
            sent = hazm.sent_tokenize(f.read())
            for s in sent:
                first_class_sentences.append(s)

    for file_name in x.secondClassAllFiles:
        with open(file_name, "r") as f:
            sent = hazm.sent_tokenize(f.read())
            for s in sent:
                second_class_sentences.append(s)

    proportionClasses = len(first_class_sentences) / len(
        second_class_sentences)
    print("nesbat : ",
          len(first_class_sentences) / len(second_class_sentences))

    for i in x.effectiveFeatures1stClass[0:int(num_features_mallet *
                                               proportionClasses /
                                               (1 + proportionClasses))]:
        mallet_features.append(i[0])
    for i in x.effectiveFeatures2stClass[0:int(num_features_mallet /
                                               (1 + proportionClasses))]:
        mallet_features.append(i[0])

    # print ( int (num_features_mallet /(1+proportionClasses) ) ,
    #             int (num_features_mallet*proportionClasses/(1+proportionClasses) ) )

    emam = open("emam.txt", "w")
    shah = open("shah.txt", "w")

    # create mallet format file and add features to them
    with open("mallet-2.0.8/mallet.txt", "w") as f:
        first_counter = 0
        second_counter = 0
        while first_counter < len(
                first_class_sentences) or second_counter < len(
                    second_class_sentences):
            if first_counter < len(first_class_sentences):
                f.write(str(first_counter) + " ")
                f.write("emam ")
                emam.write(
                    str(first_counter) + " " +
                    first_class_sentences[first_counter] + "\n")
                for j in mallet_features:
                    if j in first_class_sentences[first_counter]:
                        f.write(j)
                        f.write(" ")
                f.write("len:" +
                        str(len(first_class_sentences[first_counter])) + " ")
                f.write("hasNum:" +
                        str(hasNumbers(first_class_sentences[first_counter])) +
                        " ")

                f.write("\n")
                first_counter += 1

            if second_counter < len(second_class_sentences):
                f.write(str(second_counter) + " ")
                f.write("shah ")
                shah.write(
                    str(second_counter) + " " +
                    second_class_sentences[second_counter] + "\n")
                for j in mallet_features:
                    if j in second_class_sentences[second_counter]:
                        f.write(j)
                        f.write(" ")
                f.write("len:" +
                        str(len(second_class_sentences[second_counter])) + " ")
                f.write(
                    "hasNum:" +
                    str(hasNumbers(second_class_sentences[second_counter])) +
                    " ")

                f.write("\n")
                second_counter += 1
Example #29
 def singel_char(val):
     words = [[word for word in word_tokenize(sentence) if len(word)>1] for sentence in sent_tokenize(val)]
     words = words[0]
     val = ' '.join(words)
     return val
Example #30
    def remove_stop_words(val):
        Log.logger.info('Stop words removed')
        stops = Constant.STOP_WORDS
        words = [[word for word in word_tokenize(sentence) if word not in stops] for sentence in sent_tokenize(val)]
        words = words[0]

        val = ' '.join(words)
        return val
Example #31
 def lemma(val):
     words = [[lemmatizer.lemmatize(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
     words = words[0]
     val = ' '.join(words)
     return val
Example #32
def sentences(file="simple_text"):
    normalizer = Normalizer()
    for line in open(file, "r", encoding="utf-8").readlines():
        for sent in sent_tokenize(line):
            yield word_tokenize(sent)
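A small usage sketch, assuming "simple_text" is a plain-text file in the working directory:

for tokens in sentences("simple_text"):
    print(tokens)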
Example #33

hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
	texts.append(normalizer.normalize(text))
	if len(texts) <= 1000: continue

	sentences = []
	for text in texts:
		for sentence in sent_tokenize(text):
			words = word_tokenize(sentence)
			if len(words) >= 3:
				sentences.append(words)
	texts = []

	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	for sentence in parsed:
		# print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
		for information in extractor.extract(sentence):
			print(*information, sep=' - ', file=output)
		print(file=output)
Example #34
sample_set2 = readData(file2)

# Normalizing data; normalize() returns a new string, so rebuild the lists
n = Normalizer()
sample_set1 = [n.normalize(sample) for sample in sample_set1]
sample_set2 = [n.normalize(sample2) for sample2 in sample_set2]

#SENTENCE TOKENIZATION

all_sentences1 = []
for sample in sample_set1:
    sentences1 = sent_tokenize(sample)
    all_sentences1.extend(sentences1)
#print(all_sentences)

all_sentences2 = []
for sample in sample_set2:
    sentences2 = sent_tokenize(sample)
    all_sentences2.extend(sentences2)

size2 = len(all_sentences2)
size1 = len(all_sentences1)

for k in range(0, size1):
    fileo1.write(all_sentences1[k] + "\n")

for i in range(0, size2):