Example #1
 def test_wordsent(self):
     text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
     word_tokenize(text)
Example #2
def tokenizer(row):
    return word_tokenize(row, format="text")
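
Both snippets above only call the tokenizer. As a quick illustration of the two calling conventions, here is a minimal sketch assuming the underthesea package; the exact token split depends on the library version, so the outputs shown are illustrative.

# Minimal usage sketch; outputs are illustrative.
from underthesea import word_tokenize

text = "Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"

# Default: a list of (possibly multi-word) tokens.
tokens = word_tokenize(text)
# e.g. ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm', 'sò']

# format="text": the same tokens as one string, multi-word tokens joined by "_".
joined = word_tokenize(text, format="text")
# e.g. 'Chàng_trai 9X Quảng_Trị khởi_nghiệp từ nấm sò'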
Example #3
            if label not in label_map:
                continue
            data_with_label[label_map[label]].append(content)

    x_train = []
    y_train = []
    x_test = []
    y_test = []

    for label, contents in data_with_label.items():
        contents_length = len(contents)
        separate_index = (contents_length * 2) // 3
        for index, content in enumerate(contents):
            # for content in contents:
            content = content.lower()
            words = word_tokenize(content)
            new_words = list(map(lambda word: '_'.join(word.split(' ')),
                                 words))
            content_after_handling = ' '.join(new_words)
            if index <= separate_index:
                x_train.append(content_after_handling)
                y_train.append(label)
            else:
                x_test.append(content_after_handling)
                y_test.append(label)
    # x_test = []
    print('Start training')
    (count_vectorizer, tf_idf_transformer, x_train_counts,
     x_train_tf_idf) = calculate_tf_idf(x_train)
    # with open('sentiment_analysis_test.v1.0.txt') as file:
    #     for line in file:
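
Example #3 calls a calculate_tf_idf helper that is not included in the snippet. A hedged sketch of what such a helper might look like with scikit-learn follows; the name and the four-value return mirror the call site, but this is an assumption, not the original implementation.

# Hypothetical calculate_tf_idf helper built on scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def calculate_tf_idf(x_train):
    # Term counts, then TF-IDF weights over the training documents.
    count_vectorizer = CountVectorizer()
    x_train_counts = count_vectorizer.fit_transform(x_train)

    tf_idf_transformer = TfidfTransformer()
    x_train_tf_idf = tf_idf_transformer.fit_transform(x_train_counts)

    return count_vectorizer, tf_idf_transformer, x_train_counts, x_train_tf_idf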
Example #4
def clean_data(data):
    sentences = word_tokenize(data)
    sentences = [
        stemmer.stem(w.lower()) for w in sentences if w not in stopwords
    ]  #pre process
    return sentences
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    if keep_sentences:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
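
When keep_sentences is False, the NLTK branch above collapses the per-sentence token lists into a single stream with itertools.chain.from_iterable. A standalone sketch of that flattening step, illustrative only:

# Flattening nested per-sentence token lists into one flat token list.
import itertools

tokens_sentences = [['Hello', ',', 'world', '.'], ['Second', 'sentence', '.']]
tokens_flat = list(itertools.chain.from_iterable(tokens_sentences))
# ['Hello', ',', 'world', '.', 'Second', 'sentence', '.']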
def funcOpenFileTrainCheckPost():
    file_path = os.path.join(script_dir, "train_NLTK/checkwords_vn_train.txt")
    f = codecs.open(file_path, "r", "utf8")
    keywords_for_check = f.read()
    ArrCheck = word_tokenize(keywords_for_check)
    print(str(ArrCheck) + "\n")
Example #7
words = []
classes = []
documents = []

fileName = "../process/StopWords"
file_Stop_word = open(fileName, "r", encoding="utf-8")
stopWords = set()
for line in file_Stop_word:
    line = line.strip("\n")
    stopWords.add(line)

ignore_words = list(stopWords)

for intent in intents['intents']:
    for question in intent['questions']:
        w = word_tokenize(question)
        words.extend(w)
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

words = [w.lower() for w in words if w not in ignore_words if len(w) != 1]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))

pickle.dump(words, open('../deploy/words.pkl', 'wb'))
pickle.dump(classes, open('../deploy/classes.pkl', 'wb'))
pickle.dump(documents, open('../deploy/documents.pkl', 'wb'))
pickle.dump(ignore_words, open('../deploy/ignore_words.pkl', 'wb'))
Example #8
def preprocessing(s, show_stepbystep=False):
    if show_stepbystep:
        print("original:")
        print(s)
        print()

    # remove 'Xem thêm'
    s = re.sub('Xem thêm', '', s)
    if show_stepbystep:
        print("remove 'Xem thêm':")
        print(s)
        print()

    # convert to lower case
    s = s.lower()
    if show_stepbystep:
        print("lowercase:")
        print(s)
        print()

    # abbreviate some names
    s = re.sub('kỹ thuật phần mềm', 'ktpm', s)
    s = re.sub('công nghệ phần mềm', 'cnpm', s)
    s = re.sub('khoa học máy tính', 'khmt', s)
    s = re.sub('hệ thống thông tin', 'httt', s)
    s = re.sub('kỹ thuật máy tính', 'ktmt', s)
    s = re.sub('thương mại điện tử', 'tmđt', s)
    s = re.sub('công nghệ thông tin', 'cntt', s)
    s = re.sub('an toàn thông tin', 'attt', s)
    s = re.sub('công tác sinh viên', 'ctsv', s)
    s = re.sub('ban học tập', 'bht', s)
    if show_stepbystep:
        print("abbreviate faculty name:")
        print(s)
        print()

    # remove urls and hashtags
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'#\S+', '', s)
    if show_stepbystep:
        print('remove urls and hashtags:')
        print(s)
        print()

    # remove email address
    s = re.sub(r'\S*@\S*\s?', '', s)
    if show_stepbystep:
        print('remove email addresses:')
        print(s)
        print()

    # split into words
    tokens = word_tokenize(s)
    if show_stepbystep:
        print('tokenize:')
        print(tokens)
        print()

    # remove punctuation and number
    words = [word for word in tokens if re.sub(r"\s+", "", word).isalpha()]
    if show_stepbystep:
        print('remove punctuation:')
        print(words)
        print()

    # remove consecutive duplicates character
    words = [removeConsecutiveDuplicates(word) for word in words]
    if show_stepbystep:
        print('remove consecutive duplicates character:')
        print(words)
        print()

    # replace abbreviation of word
    words = [replace_abbr(word) for word in words]
    if show_stepbystep:
        print('replace abbreviation:')
        print(words)
        print()

    # replace " " with "_"
    words = [re.sub(r"\s+", "_", word) for word in words]
    if show_stepbystep:
        print('replace space with "_" :')
        print(words)
        print()

    # remove single character
    words = [word for word in words if len(word) > 1]
    if show_stepbystep:
        print('remove single character:')
        print(words)
        print()

    return ' '.join(words)
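
preprocessing() relies on two helpers, removeConsecutiveDuplicates and replace_abbr, that are not shown in the snippet. A hedged sketch of plausible implementations follows; the names mirror the call sites, and the abbreviation map is a placeholder, not the original data.

import re

def removeConsecutiveDuplicates(word):
    # Collapse runs of the same character, e.g. "vuiiii" -> "vui".
    return re.sub(r'(.)\1+', r'\1', word)

# Placeholder abbreviation map; the real mapping is not part of the snippet.
ABBREVIATIONS = {'ko': 'không', 'dc': 'được'}

def replace_abbr(word):
    # Expand a known abbreviation, otherwise return the word unchanged.
    return ABBREVIATIONS.get(word, word)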
Example #9
    for sentences in file:
        corpus.append(transform_row(sentences))
for sentences in corpus:
    W = None
    Text = sentences
    ListSentence.append(Text)
    W = ViWordSegment()
    W.parseword()

words = list(set(words))
words.sort()
X = np.zeros([len(words), len(words)])
for sentences in corpus:
    # similarly, remove stopwords from each sentence
    tokens = []
    for word in word_tokenize(sentences):
        NewWord = word.replace('.', '').replace(',', '').strip()
        if NewWord != '':
            if not (NewWord in StopWordsInput):
                tokens.append(NewWord.lower())

data = []
for sentences in corpus:

    tokens = []
    for word in word_tokenize(sentences):
        NewWord = word.replace('.', '').replace(',', '').strip()
        if NewWord != '':
            if not (NewWord in StopWordsInput):
                tokens.append(NewWord.lower())
    data.append(tokens)
Example #10
# HANDLE REQUEST_STATUS 1:
time = [year]
for i in numSeeker(sw_remover(standardize(datve['date'],0))):
  time.append(i)
print(time)
for j in timeSeeker(sw_remover(standardize(datve['time'],2))):
  time.append(j)
if len(time)==6:
  d = datetime.datetime(int(time[0]),int(time[1]),int(time[2]),int(time[3]),int(time[4]),int(time[5]))
  timestamp = d.replace(tzinfo=timezone.utc).timestamp()
  seats = numSeeker(sw_remover(standardize(datve['seats'],2)))

  request01 = {
      "status": 0,
      "data": {
          "pickupAddress":placecorrect(word_tokenize(sw_remover(standardize(datve['pickupAddress'],1)))),
          "takeoffAddress":placecorrect(word_tokenize(sw_remover(standardize(datve['takeoffAddress'],1)))),
          "time": timestamp,
          "seats": seats[0]
      }
  }
  print(request01)
else:
  print("error")




### FEEDBACK HANDLING FUNCTIONS ###
def abbriviateCorrect(mss):  # handle abbreviations
  final = ''
Example #11
def buil_new_model():
    with open('deploy/intents.json', encoding="utf-8") as json_data:
        intents = json.load(json_data)

    words = []
    classes = []
    documents = []

    fileName = "process/StopWords"
    file_Stop_word = open(fileName, "r", encoding="utf-8")
    stopWords = set()
    for line in file_Stop_word:
        line = line.strip("\n")
        stopWords.add(line)

    ignore_words = list(stopWords)

    for intent in intents['intents']:
        for question in intent['questions']:
            w = word_tokenize(question)
            words.extend(w)
            documents.append((w, intent['tag']))
            if intent['tag'] not in classes:
                classes.append(intent['tag'])

    words = [w.lower() for w in words if w not in ignore_words if len(w) != 1]
    words = sorted(list(set(words)))

    classes = sorted(list(set(classes)))

    pickle.dump(words, open('deploy/words.pkl', 'wb'))
    pickle.dump(classes, open('deploy/classes.pkl', 'wb'))
    pickle.dump(documents, open('deploy/documents.pkl', 'wb'))
    pickle.dump(ignore_words, open('deploy/ignore_words.pkl', 'wb'))

    dataset = []
    output = []

    output_empty = [0] * len(classes)

    for doc in documents:
        bag = []
        # print(doc)
        question_words = doc[0]
        question_words = [
            word.lower() for word in question_words if word not in ignore_words
            if len(word) != 1
        ]
        # print(question_words)
        for w in words:
            if w in question_words:
                bag.append(1)
            else:
                bag.append(0)

        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1
        dataset.append([bag, output_row])

        # print(bag)
    random.shuffle(dataset)
    len_dataset = len(dataset)

    len_train = int(len_dataset * 0.75)

    training = dataset[0:len_train]
    testing = dataset[len_train:len_dataset]

    training = np.array(training)
    testing = np.array(testing)

    train_x = list(training[:, 0])
    test_x = list(testing[:, 0])

    train_y = list(training[:, 1])
    test_y = list(testing[:, 1])

    model = Sequential()
    model.add(Dense(128, input_shape=(len(train_x[0]), ), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(train_y[0]), activation='softmax'))

    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['acc'])
    history = model.fit(np.array(train_x),
                        np.array(train_y),
                        epochs=10000,
                        batch_size=64)
    model_path = "deploy/model_h3d.h5"
    model.save(model_path)

    # Evaluate the model on the test data using `evaluate`
    print("Evaluate on test data")
    results = model.evaluate(np.array(test_x), np.array(test_y), batch_size=64)
    print("test loss, test acc:", results)
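
The artifacts saved above (words.pkl, classes.pkl, model_h3d.h5) can later be reused for prediction. A minimal inference sketch under the same assumptions, reusing the bag-of-words encoding from training:

# Sketch: load the saved artifacts and classify a new question.
import pickle
import numpy as np
from keras.models import load_model
from underthesea import word_tokenize

def classify_question(question):
    words = pickle.load(open('deploy/words.pkl', 'rb'))
    classes = pickle.load(open('deploy/classes.pkl', 'rb'))
    model = load_model('deploy/model_h3d.h5')

    # Same bag-of-words encoding as in training.
    question_words = [w.lower() for w in word_tokenize(question)]
    bag = [1 if w in question_words else 0 for w in words]

    probs = model.predict(np.array([bag]))[0]
    return classes[int(np.argmax(probs))]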
Example #12
    des = doc["Description"]
    news_link = doc["NewspaperLink"]

    ### Check whether this article has already been pushed to Elasticsearch
    ### Since the data pushed to Elasticsearch is tokenized again by the standard tokenizer,
    ### the code below is no longer exact for text fields, but it still works for numeric fields
    # result_ck = es.search(
    #     index="my-index",
    #     body={"query": {"bool": {"must": {"term": {"NewspaperLink": news_link}}}}},
    # )
    # if result_ck["hits"]["total"]["value"] > 0:
    # continue

    ### Tokenize
    ### Tokenize title
    title_tokenized = underthesea.word_tokenize(title, format="text")
    ### Tokenize description
    des_tokenized = underthesea.word_tokenize(des,
                                              format="text").replace("\n", "")

    ### Tokenize content
    content_tokenized = []
    for ct in content:
        if ct["type"] == "text":
            paragraph = ct["content"]
            sentences = underthesea.sent_tokenize(paragraph)

            paragraph_tokenized = ""
            for sentence in sentences:
                ### Tokenize sentence in paragraph
                sentence = underthesea.word_tokenize(sentence, format="text")
Example #13
from underthesea import pos_tag
from underthesea import word_tokenize

sen = 'kjkhigyf yêu cầu phát triển và ứng dụng công nghệ thông tin trong sản xuất kinh doanh và quản lý hướng tới mục tiêu nâng cao toàn diện năng lực cạnh tranh quốc gia coi đây là con đường ngắn nhất để Việt Nam tiến kịp các nước phát triển tiến cùng thời đại'
seg = word_tokenize(sen)
tag = pos_tag(sen)

tag2 = [i[1] for i in tag]

print('done')






Example #14
 def auto_tags(self):
     self.tokens = word_tokenize(self.text)
def func_DataAnalysis():
    func_pushcontent()
    file_path_file2 = os.path.join(script_dir, "file2.txt")
    file_path_pl_train = os.path.join(script_dir, "train_NLTK/pl_train.txt")
    file_path_checkwords = os.path.join(script_dir,
                                        "train_NLTK/checkwords_vn_train.txt")
    file_path_stopwords = os.path.join(script_dir,
                                       "train_NLTK/vn_stopwords.txt")

    # raw post content; strip everything that is not a letter or a digit
    f = codecs.open(file_path_file2, "r", "utf8")
    text_raw = f.read()
    # turn special characters into " "
    text = re.sub(r"\W+|_", " ", text_raw)
    # print("*Text raw: "+text_raw)
    # print("\nNoi Dung Chinh: ")
    tokens = word_tokenize(text)
    # print(text)
    # --
    # look for programming languages mentioned in the post
    programming_language = []
    company_email = re.findall(r'\S+@\S+', text_raw)
    # strip() removes the outermost characters
    link_post = ((re.findall(r'(https?://www.facebook.com/[^\s]+)',
                             text_raw))[0]).strip(',"')
    print("link_post: " + link_post)
    job_position_check = [
        "Senior", "Fresher", "Intern", "Junior", "Tester", "Dev",
        "Software Test Intern", "Software Test Fresher"
    ]
    company_syn = ["công ty", "cty"]
    job_position = []
    f = open(file_path_pl_train, "r")
    p_languges_raw = f.read()
    p_languges = p_languges_raw.split()
    # niceword = word_tokenize(analy1)
    for i in company_syn:
        for j in range(len(tokens)):
            if (i.lower() == tokens[j].lower()):
                company_name = tokens[j + 1]
                break
            else:
                if company_email != None:
                    try:
                        company_name = (((
                            company_email[0].split('.'))[0]).split('@'))[1]
                    except:
                        company_name = tokens[0]
                else:
                    company_name = tokens[0]
        if (i.lower() == tokens[j].lower()):
            break

    for i in tokens:
        for j in job_position_check:
            if (i.lower() == j.lower()):
                if (j not in job_position):
                    job_position.append(j)
    for i in text.split():
        for j in p_languges:
            if (i.lower() == j.lower()):
                if (j not in programming_language):
                    programming_language.append(j)

    print("--Company name: " + str(company_name))
    print("--Post nay nhac den cac nn lap trinh: " + str(programming_language))
    print("--Gmail company: " + str(company_email))
    print("--Link post: " + str(link_post))
    print("--Vi tri can tuyen: " + str(job_position))

    # print("--Desc: "+text)
    x = ''
    y = ''
    z = ''
    for i in programming_language:
        x = x + i + ", "
    programminglanguage = x.strip(", ")
    for i in list(set(company_email)):
        y = y + i + ", "
    companyemail = y.strip(", ")
    for i in job_position:
        z = z + i + ", "
    jobposition = z.strip(", ")

    # --
    # print("ARR: "+str(tokens))  # day la noi dung file input da split

    # flag whether this post is likely spam

    f = codecs.open(file_path_checkwords, "r", "utf8")
    text_check = f.read()
    Arr_check = word_tokenize(text_check)
    # print("ARR_check mang nay kiem tra day co phai post spam k ?: "+str(Arr_check))
    alert = 1
    for i in tokens:
        for j in Arr_check:
            if (i.lower() == j.lower()):
                alert = 0
    # --
    # remove stopwords from the post content so the statistics are cleaner
    f = codecs.open(file_path_stopwords, "r", "utf8")
    vnstopwords = f.read()
    vn_sw = vnstopwords.splitlines()
    clean_tokens = tokens[:]
    # bintrash = []
    for token in tokens:
        if token in vn_sw:
            clean_tokens.remove(token)
            # bintrash.append(token)

    print("--Length clean_tokens : " + str(len(clean_tokens)))
    print("--Length tokens : " + str(len(tokens)))
    # print("\n--Show clean token : "+str(clean_tokens))
    # print("\n--Show Bin : "+str(bintrash))

    # count word frequencies after removing stopwords from the post

    # freq = nltk.FreqDist(clean_tokens)

    # plot the frequency chart after removing stopwords from the post
    # freq.plot(20, cumulative=False)

    # freq = nltk.FreqDist(tokens)  # count word frequencies over the full post
    # for key,val in freq.items():
    #     print(str(key) + ':' + str(val))
    # --
    # print("Data push: "+company_name+" "+companyemail+" "+programminglanguage+" "+jobposition+" "+link_post+" "+text+" "+str(alert))
    database(company_name.capitalize(), companyemail, programminglanguage,
             jobposition, link_post, text.replace('description ', ''), alert)
    print("-------alert: " + str(alert))
    if (alert != 0):
        print("WARNING!-Day co kha nang cao la post khong lien quan!\n")
    else:
        print("Khong co canh bao nao!")
    f.close()
Example #16

if __name__ == "__main__":
    input = "./data/data_ver03.txt"
    output = "./data/data_offical.txt"
    content = None
    with open(input, "r") as f:
        content = f.read()

    content = normalize_content(content)

    content = change_teen_code(content)

    content = remove_emoji(content)

    content = remove_other_language(content)

    # word tokenize again.
    content = content.replace("_", " ").split("\n")

    result = []
    for sent in content:
        words = word_tokenize(sent)
        words = [x.replace(" ", "_") for x in words if x != ""]
        result.append(" ".join(words))

    result = "\n".join(result)

    with open(output, "w") as f:
        f.write(result)
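
The pipeline above calls normalize_content, change_teen_code, remove_emoji and remove_other_language, none of which are included in the snippet. As one illustration, a hedged sketch of a remove_emoji-style helper; the Unicode ranges are indicative, not exhaustive.

# Hypothetical remove_emoji helper: strips common emoji/pictograph ranges.
import re

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F300-\U0001F5FF"   # symbols & pictographs
    "\U0001F600-\U0001F64F"   # emoticons
    "\U0001F680-\U0001F6FF"   # transport & map symbols
    "\U00002600-\U000027BF"   # misc symbols & dingbats
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(text):
    return EMOJI_PATTERN.sub("", text)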
def newfunc_DataAnalysis():
    t1 = time.time()
    file_path = os.path.join(script_dir, "testnewmyfile20.txt")
    file_path_pl_train = os.path.join(script_dir, "train_NLTK/pl_train.txt")
    file_path_checkwords = os.path.join(script_dir,
                                        "train_NLTK/checkwords_vn_train.txt")
    f = codecs.open(file_path, "r", "utf8")
    txtjson = f.read()
    try:
        objjson = json.loads(txtjson)
    except:
        print("File json loi kiem tra lai!")
        quit()
    f.close()
    job_position_check = [
        "Senior", "Fresher", "Intern", "Junior", "Tester", "Dev",
        "Software Test Intern", "Software Test Fresher"
    ]
    company_syn = ["công ty", "cty"]
    job_position = []
    programming_language = []
    count = 0
    for i in range(len(objjson)):
        link_post = objjson[i]['post_url']
        desc_insertdb = remove_emojis(objjson[i]['description'])
        desc_analysis = re.sub(r"\W+|_", " ", desc_insertdb)
        tokens = word_tokenize(desc_analysis)
        # print(tokens)
        f = open(file_path_pl_train, "r")
        p_languges_raw = f.read()
        p_languges = p_languges_raw.split()
        f.close()
        company_email = re.findall(r'\S+@\S+', desc_insertdb)
        for i in company_syn:
            for j in range(len(tokens)):
                if (i.lower() == tokens[j].lower()):
                    company_name = tokens[j + 1]
                    break
                else:
                    if company_email != None:
                        try:
                            company_name = (((
                                company_email[0].split('.'))[0]).split('@'))[1]
                        except:
                            company_name = tokens[0]
                    else:
                        company_name = tokens[0]
            if (i.lower() == tokens[j].lower()):
                break
        for i in tokens:
            for j in job_position_check:
                if (i.lower() == j.lower()):
                    if (j not in job_position):
                        job_position.append(j)
        for i in desc_analysis.split():
            for j in p_languges:
                if (i.lower() == j.lower()):
                    if (j not in programming_language):
                        programming_language.append(j)
        # join the lists into strings before pushing them to the DB
        x = ''
        y = ''
        z = ''
        for i in programming_language:
            x = x + i + ", "
        programminglanguage = x.strip(", ")
        for i in list(set(company_email)):
            y = y + i + ", "
        companyemail = y.strip(", ")
        for i in job_position:
            z = z + i + ", "
        jobposition = z.strip(", ")
        # check whether the post is spam
        f = codecs.open(file_path_checkwords, "r", "utf8")
        text_check = f.read()
        Arr_check = word_tokenize(text_check)
        alert = 1
        for i in tokens:
            for j in Arr_check:
                if (i.lower() == j.lower()):
                    alert = 0
        # push to the DB
        print("------" + str(count + 1) + "------")
        print("Company mail : " + companyemail + "| Company name : " +
              company_name.capitalize() + "| PL : " + programminglanguage +
              "| Job position : " + jobposition + "| Status: " + str(alert))
        database(company_name.capitalize(), companyemail, programminglanguage,
                 jobposition, link_post, desc_insertdb, alert)
        count = count + 1
    noti = "Xu ly " + str(
        len(objjson)) + " post thanh cong " + str(count) + " post."
    print(noti)
    t2 = time.time()
    processingtime = "processing time: {:.3f}".format(t2 - t1)
    print(processingtime)
    history_path = os.path.join(script_dir, "history.txt")
    history = codecs.open(history_path, "a", "utf8")
    dt = datetime.datetime.now()
    history.write(noti + " || " + processingtime + " || " + dt.strftime("%x") +
                  " " + dt.strftime("%X"))
    history.write("\n")
    history.close()
Example #18
def receive_message():
    global intention
    intention = 0
    global loc_apply_flag

    if request.method == 'GET':
        """Before allowing people to message your bot, Facebook has implemented a verify token
        that confirms all requests that your bot receives came from Facebook."""
        if (request.args.get('hub.verify_token') == VERIFY_TOKEN):
            return request.args.get('hub.challenge')
    #if the request was not get, it must be POST and we can just proceed with sending a message back to user
    else:
        # get whatever message a user sent the bot
        output = request.get_json()
        print('__check output variable:', output)
        messenger.handle(request.get_json(force=True))
        inten_flow(intention)
        for event in output['entry']:
            messaging = event['messaging']
            for message in messaging:
                if message.get('message'):
                    #Facebook Messenger ID for user so we know where to send response back to
                    if message['message'].get('text'):
                        print('__check message variable:', message)
                        if 'quick_reply' in message['message']:
                            text = {
                                'text':
                                message['message']['quick_reply']['payload']
                            }

                            inten_flow(
                                execute_flow(
                                    message['message']['quick_reply']
                                    ['payload'], intention))
                            text['quick_replies'] = quick_replies.to_dict()
                            messenger.send(text, 'RESPONSE')
                        else:
                            # print('message type: ', type(message['message'].get('text')))
                            # response_message = repOfficial(message['message'].get('text'))

                            # text = {'text': response_message}
                            # # # print("TEXT: ", text)
                            # # text = {'text': 'A message Hi'}
                            # # # text['quick_replies'] = quick_replies.to_dict()
                            # # messenger.send('Bot say: {0}'.format(text['text']), 'RESPONSE')
                            # # text = {'text': 'A Message'}
                            # # text['quick_replies'] = quick_replies.to_dict()
                            # messenger.send(text, 'RESPONSE')

                            message_text = message['message'].get('text')
                            list_of_out_of_work = [
                                'cmnd', 'chứng minh', 'hộ khẩu', 'KT1', 'KT2',
                                'KT3'
                            ]
                            list_of_say_hello = [
                                'hello', 'hi', 'chào', 'aloha', 'morning'
                            ]
                            message_check = word_tokenize(message_text)
                            for i in range(len(message_check)):
                                if message_check[i] in list_of_out_of_work:
                                    str_bot_rep = "Xin lỗi bạn, tôi chỉ có thể hỗ trợ bạn về vấn đề hộ chiếu, những thủ tục liên quan khác. \
                                    Bạn xin chờ tính năng phát triển tiếp theo."

                                    text = {'text': str_bot_rep}
                                    messenger.send(text, 'RESPONSE')
                                    return ''
                                elif message_check[i] in list_of_say_hello:
                                    str_bot_rep = "Chào bạn, tôi là Chatbot hỗ trợ bạn với những thủ tục cơ bản khi làm hộ chiếu lần đầu.\
                                    Nếu bạn có thắc mắc gì về những việc cần làm khi làm hộ chiếu lần đầu thì cứ hỏi tôi."

                                    text = {'text': str_bot_rep}
                                    messenger.send(text, 'RESPONSE')
                                    return ''

                            bot_rep = []
                            userIntent = ic_predict(message_text)[0]
                            print("user's intent: ", userIntent)
                            print('loc_apply_flag: ', loc_apply_flag)

                            if loc_apply_flag == True:
                                locQuantity, locApply = getNerName(
                                    'Làm hộ chiếu ở ' + message_text)
                                print('locApply: ', locApply)
                                for i in range(len(locQuantity)):
                                    bot_rep.append(getLocApply(locApply[i]))
                                loc_apply_flag = False
                            else:
                                if userIntent == "where_loc_apply":
                                    if isAnyLOC(message_text) == False:
                                        text = {
                                            'text': 'Bạn đang ở tỉnh thành: '
                                        }
                                        messenger.send(text, 'RESPONSE')
                                        loc_apply_flag = True
                                        return ''
                                    else:
                                        locQuantity, locApply = getNerName(
                                            message_text)
                                        print(locApply)
                                        if locApply == []:
                                            bot_rep = getLocApply(locApply)
                                        else:
                                            for i in range(len(locQuantity)):
                                                bot_rep.append(
                                                    getLocApply(locApply[i]))
                                else:
                                    bot_rep = normalRep(userIntent)

                            str_bot_rep = ''
                            for i in range(len(bot_rep)):
                                str_bot_rep = str_bot_rep + bot_rep[i]

                            text = {
                                'text': str_bot_rep
                            }  #We must add 'text' variable like this or it can not send to Messenger.
                            print('text: ', text)
                            messenger.send(text, 'RESPONSE')
                            print('message sent')

    return ''
Example #19
def wl_word_tokenize(main,
                     text,
                     lang,
                     word_tokenizer='default',
                     flat_tokens=True):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wl_text_utils.check_word_tokenizers(main,
                                            lang=lang,
                                            word_tokenizer=word_tokenizer)
    else:
        wl_text_utils.check_tokenizers(main,
                                       lang=lang,
                                       word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(
            main, text, lang)

        if word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NLTK Tokenizer'):
            nltk_tokenizer = nltk.NLTKWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nltk_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(toktok_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(tweet_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang)

        moses_tokenizer = sacremoses.MosesTokenizer(
            lang=wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            tokens_multilevel.append(
                moses_tokenizer.tokenize(sentence, escape=False))

    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_multilevel.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_multilevel.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_multilevel.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_multilevel.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_multilevel.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_multilevel.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wl_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='rus')

        for sentence in sentences:
            tokens_multilevel.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(main,
                                                                  text,
                                                                  lang='tha')

        if word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='longest'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching + TCC (Safe Mode)'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm-safe'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='bod')

        for sentence in sentences:
            tokens_multilevel.append([
                token.text
                for token in main.botok_word_tokenizer.tokenize(sentence)
            ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_multilevel.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_multilevel):
        tokens_multilevel[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary='',
                                                sentence_ending=True)
    else:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary=' ',
                                                sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_multilevel):
            tokens_multilevel[i] = wl_sentence_tokenization.wl_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wl_misc.flatten_list(tokens_multilevel))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_multilevel
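
wl_word_tokenize flattens its result via wl_misc.flatten_list, which is not part of the snippet. A plausible stand-in is sketched below; this is an assumption, and the real Wordless helper may differ.

# Hypothetical stand-in for wl_misc.flatten_list: depth-first flattening
# of arbitrarily nested lists, yielding leaf items in order.
def flatten_list(nested):
    for item in nested:
        if isinstance(item, list):
            yield from flatten_list(item)
        else:
            yield item

# list(flatten_list([['a', 'b'], [['c'], 'd']])) -> ['a', 'b', 'c', 'd']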
Example #20
def evaluate_classify_model():
    test_df = pd.read_csv('data/test-more-info.txt',
                          sep='\t',
                          header=None,
                          names=['id', 'origin_q', 'compare_q', 'label', 'score_elastic'],
                          )
    test_df['predict'] = 0.01

    doc2vec_model = Doc2Vec.load('gensim/model/question.d2v')
    classify_model = load_model('model/simple_classify_model.h5')

    for index, row in test_df.iterrows():
        origin_q = row['origin_q']
        compare_q = row['compare_q']

        origin_q_vector = doc2vec_model.infer_vector(simple_preprocess(word_tokenize(origin_q, format='text')))
        compare_q_vector = doc2vec_model.infer_vector(simple_preprocess(word_tokenize(compare_q, format='text')))
        concat_vector = np.concatenate((origin_q_vector, compare_q_vector))
        arr_wraper = np.array([concat_vector])

        test_df.at[index, 'predict'] = classify_model.predict(arr_wraper)[0][0]

    test = test_df.loc[test_df['id'] == 22972022]
    test_sort = test.sort_values(by='predict', ascending=False).reset_index(drop=True)

    id_queries = []
    for id_query in test_df['id']:
        if id_query not in id_queries:
            id_queries.append(id_query)

    mAP_df = pd.DataFrame(data=id_queries, columns=['id'])

    score_AP_model_alls = []
    score_AP_model_top10 = []
    score_AP_elastic_alls = []
    score_AP_elastic_top10 = []
    for id_query in mAP_df['id']:
        group_id = test_df.loc[test_df['id'] == id_query]

        # Calculate mAP for the model ranking
        group_predict_sort = group_id.sort_values(
            by='predict',
            ascending=False).reset_index(drop=True)

        AP_model_all = convenion.caculate_AP(group_predict_sort['label'])
        AP_model_top10 = convenion.caculate_AP(group_predict_sort['label'][:10])

        score_AP_model_alls.append(AP_model_all)
        score_AP_model_top10.append(AP_model_top10)

        # Calculate mAP for the Elasticsearch ranking
        group_elastic_sort = group_id.sort_values(
            by='score_elastic',
            ascending=False).reset_index(drop=True)
        AP_elastic_all = convenion.caculate_AP(group_elastic_sort['label'])
        AP_elastic_top10 = convenion.caculate_AP(group_elastic_sort['label'][:10])

        score_AP_elastic_alls.append(AP_elastic_all)
        score_AP_elastic_top10.append(AP_elastic_top10)

    mAP_df['AP_model_all'] = score_AP_model_alls
    mAP_df['AP_model_top10'] = score_AP_model_top10

    mAP_df['AP_elastic_all'] = score_AP_elastic_alls
    mAP_df['AP_elastic_top10'] = score_AP_elastic_top10

    print('mAP elastic all: ', sum(score_AP_elastic_alls) / len(score_AP_elastic_alls))
    print('mAP model all: ', sum(score_AP_model_alls) / len(score_AP_model_alls))
    print('mAP elastic top10: ', sum(score_AP_elastic_top10) / len(score_AP_elastic_top10))
    print('mAP model top10: ', sum(score_AP_model_top10) / len(score_AP_model_top10))

    return mAP_df
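The helper convenion.caculate_AP above is project-local and not shown in this snippet. Below is a minimal average-precision sketch that matches how it is called (a ranked sequence of 0/1 relevance labels); the project's actual implementation may differ.

def caculate_AP(ranked_labels):
    # Average precision over a ranked list of 0/1 relevance labels:
    # the mean of precision@k over every rank k where the item is relevant.
    hits = 0
    precisions = []
    for k, label in enumerate(ranked_labels, start=1):
        if label == 1:
            hits += 1
            precisions.append(hits / k)
    return sum(precisions) / len(precisions) if precisions else 0.0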
Пример #21
0
# Load data

# Strip trailing newlines so the stop word membership checks below actually match
with open('vietnamese-stopwords.txt', 'r') as file:
    stopwords = [line.strip() for line in file]

words = []
documents = []
classes = []

import json
with open('data.json') as json_data:
    intents = json.load(json_data)

for intent in intents['intents']:
    for pattern in intent['patterns']:
        w = word_tokenize(pattern)
        words.extend(w)
        print(intent['tag'])
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])
words = [stemmer.stem(w.lower()) for w in words if w not in stopwords]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))


def clean_data(data):
    sentences = word_tokenize(data)
    sentences = [
        stemmer.stem(w.lower()) for w in sentences if w not in stopwords
    ]
    return sentences
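A quick usage sketch of clean_data, assuming the stemmer and the stop word list loaded above (the exact tokens returned depend on both):

preprocessed = clean_data("Tôi muốn đặt lịch hẹn khám bệnh")
print(preprocessed)  # a list of stemmed, lower-cased tokens with stop words removed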
Пример #22
0
def search_tf_idf(category, tags_list):
    with open(TMP_PATH + 'Top_20_keyword_' + str(category) + '_tf_idf.csv',
              'w',
              encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=tags_list)
        writer.writeheader()
        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        page = es.search(
            index='baomoi.com',
            doc_type='doc',
            scroll='2m',
            # size = 100, #number of hits to return
            body={
                "query": {
                    "match_phrase": {
                        "categories": {
                            "query": category
                        }
                    }
                }
            })

        sid = page['_scroll_id']
        scroll_size_ = page['hits']['total']
        print(category)
        scroll_size = 1
        hits = page['hits']['hits']
        cnt = Counter()
        stop_word = [
            'bị', 'bởi', 'cả', 'các', 'cái', 'cần', 'càng', 'chỉ', 'chiếc',
            'cho', 'chứ', 'chưa', 'chuyện', 'có', 'có thể', 'cứ', 'của',
            'cùng', 'cũng', 'đã', 'đang', 'đây', 'để', 'đến', 'đến nỗi', 'đều',
            'điều', 'do', 'đó', 'được', 'dưới', 'gì', 'khi', 'không', 'là',
            'lại', 'lên', 'lúc', 'mà', 'mỗi', 'một', 'một cách', 'này', 'năm',
            'nên', 'nếu', 'ngay', 'nhiều', 'như', 'nhưng', 'những', 'nơi',
            'nữa', 'ở', 'phải', 'qua', 'ra', 'rằng', 'rất', 'rồi', 'sau',
            'sẽ', 'so', 'sự', 'tại', 'theo', 'thì', 'trên', 'trong', 'trước',
            'từ', 'từng', 'và', 'vẫn', 'vào', 'vậy', 'về', 'vì', 'việc', 'với',
            'vừa', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
            '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^',
            '_', '`', '{', '|', '}', '~'
        ]

        word_list = list()
        while scroll_size > 0:
            # print('Scrolling ...')
            page = es.scroll(scroll_id=sid, scroll='2m')

            for post in hits:
                post_ = post['_source']['summary']
                word_tok = word_tokenize(post_)
                word_fil = list(filter(lambda x: x not in stop_word, word_tok))
                cnt += Counter(word_fil)
                word_list.append(post_)
            # cal_idf(cnt,word_tok,scroll_size)
            # print(word_list)
            # print (cnt.most_common(10))
            # print(cnt.most_common(10))
            # print (word_tok)
            hits = page['hits']['hits']

            # Update the scroll_id
            sid = page['_scroll_id']

            # Get the number of results
            scroll_size = len(page['hits']['hits'])

        # print(type(cnt))
        tf_dict = cal_tf(cnt, word_tok)
        print('############################')
        idf_dict = cal_idf(cnt, word_list, scroll_size_)
        # print(scroll_size_)
        tf_idf = {}
        for word, val in tf_dict.items():
            tf_idf[word] = val * idf_dict[word]
        sort = Counter(tf_idf)
        # print(sort.most_common(20))
        for item in sort.most_common(20):
            writer.writerow({
                category: str(item[0]),
                tags_list[1]: "{0:.2f}".format(item[1])
            })
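The helpers cal_tf and cal_idf used above are also project-local. Below is a minimal sketch consistent with how they are called here (cal_tf over the aggregate term Counter, cal_idf over the Counter, the list of raw summary strings, and the total hit count), assuming plain term frequency and a smoothed log IDF; the real helpers may normalize differently.

import math

def cal_tf(cnt, tokens):
    # Term frequency: each term's aggregate count divided by the total count.
    # (The second argument mirrors the call site above but is unused in this sketch.)
    total = sum(cnt.values())
    return {word: count / total for word, count in cnt.items()}

def cal_idf(cnt, documents, n_docs):
    # Inverse document frequency: log of the document count over the number of
    # raw summary strings that contain the term, with +1 smoothing on both counts.
    idf = {}
    for word in cnt:
        df = sum(1 for doc in documents if word in doc)
        idf[word] = math.log((1 + n_docs) / (1 + df))
    return idf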
Пример #23
0
def remove_stopword(text):
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stopwords)
    dt_file = [
        open('./saved_model/dt_model_group-12.pkl', 'rb'), 'Decision Tree'
    ]
    nb_file = [
        open('./saved_model/nb_model_group-12.pkl', 'rb'), 'Naive Bayes'
    ]
    rf_file = [
        open('./saved_model/rf_model_group-12.pkl', 'rb'), 'Random Forest'
    ]
    svm_file = [
        open('./saved_model/svm_model_group-12.pkl', 'rb'),
        'Support Vector Machine'
    ]
    knn_file = [
        open('./saved_model/knn_model_group-12.pkl', 'rb'),
        'K Nearest Neighbor'
    ]

    for file in [svm_file, dt_file, rf_file, nb_file, knn_file]:
        name = file[1]
        model = pickle.load(file[0])
        tokenized_input_text = uts.word_tokenize(input_text, format="text")
        features = get_features(tokenized_input_text)  # type: dict

        feature_values = np.array(list(features.values()))
        feature_values = feature_values.reshape(1, -1)

        print(name, ':       ', model.predict(feature_values))
    # Get input feature

    # print(dt_model.predict(input_text))
Пример #25
0
def vi2IPA_split(texts, delimit):
    content = []
    with open(imp.find_module('viphoneme')[1] + "/Popular.txt",
              encoding="utf-8") as f:
        content = f.read().splitlines()
    tess = texts.split(".")
    Results = ""
    for text in tess:
        #print("------------------------------------------------------")
        TN = TTSnorm(text)
        #TN=text
        #print("------------------------------------------------------")
        #print("Text normalize:              ",TN)
        TK = word_tokenize(TN)
        #print("Vietnamese Tokenize:         ",TK)

        for iuv, under_valid in enumerate(TK):
            token_under = under_valid.split(" ")
            checkinvalid = 0
            ##print(token_under)
            if len(token_under) > 1:
                for tok in token_under:
                    if tok not in content or "[" in T2IPA(tok):
                        checkinvalid = 1
            if checkinvalid == 1:
                TK = TK[:iuv] + TK[iuv + 1:]
                for tok in reversed(token_under):
                    TK.insert(iuv, tok)

        IPA = ""

        for tk in TK:
            ipa = T2IPA_split(tk, delimit).replace(" ", "_")
            if ipa == "":
                IPA += delimit + tk + delimit + " "
            elif ipa[0] == "[" and ipa[-1] == "]":
                eng = eng_to_ipa.convert(tk)
                if eng[-1] == "*":
                    if tk.lower().upper() == tk:
                        ##print("ENGLISH",tk)
                        # Spell the English word out letter by letter
                        letter2sound = ""
                        for char in tk:
                            CHAR = str(char).lower()
                            if CHAR in list(EN.keys()):
                                letter2sound += EN[CHAR] + " "
                            else:
                                letter2sound += char + " "
                        IPA += T2IPA_split(letter2sound, delimit) + " "
                    else:
                        # Keep the word unchanged
                        # Future experiment: an unknown word could be converted with eng_norm instead of being kept unchanged like this
                        IPA += Parsing("default", tk.lower(), delimit) + " "
                else:
                    # This was used for the version where English is not split into syllables
                    #IPA+=Parsing("default",eng,delimit)+" "
                    # This version splits the English word into syllables
                    IPA += normEng(tk, delimit) + delimit + " "

                # Check the Etrain English dictionary
                # If there is a mapping, use it
                # If not, check whether the word contains a vowel
                # If it does, keep the word as-is
                # Otherwise, spell it out letter by letter
                #print("                                    ..................Out of domain word: " ,ipa)
            else:
                IPA += ipa + " "
        IPA = re.sub(delimit + '+', delimit, IPA)
        IPA = re.sub(' +', ' ', IPA)
        #print("IPA Vietnamese:             ",IPA)
        #print("------------------------------------------------------")
        Results += IPA.rstrip() + " " + delimit + "." + delimit + " "

    return Results.rstrip()
def word_tokenizer(document):
    document = word_tokenize(document, format="text")
    return document
Пример #27
0
def tachtu(sentence):
    sentence = clean_text(sentence)
    sentence = give_emoji_free_text(sentence)
    sentence = word_tokenize(sentence)
    return sentence
Пример #28
0
def clear_unknown_letter(text):
    text = strip_non_alphanum(text)
    text = word_tokenize(text)
    return process_lower(text)
Пример #29
0
def standardized_sentence(sentence):
    sentence = word_tokenize(sentence.lower(), format="text")
    return sentence
Пример #30
0
 def test_decomposed_from(self):
     text = u"yếu"
     actual = word_tokenize(text)
     expected = [u'yếu']
     self.assertEqual(actual, expected)
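The method above is a test-case fragment; its enclosing class is not shown. A minimal self-contained wrapper sketch, assuming the standard unittest module and underthesea's word_tokenize (the class name here is arbitrary):

import unittest
from underthesea import word_tokenize

class TestWordTokenize(unittest.TestCase):
    def test_decomposed_from(self):
        # Input typed with decomposed Unicode should still come back as a single token.
        text = u"yếu"
        actual = word_tokenize(text)
        expected = [u'yếu']
        self.assertEqual(actual, expected)

if __name__ == '__main__':
    unittest.main()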