Example #1
 def __init__(self):
     self.data_path = os.path.join('.', 'fox_data', 'fox.csv')
     self.tokenizer = WordPunctTokenizer()
Example #2
import sys
import time
import pickle
import argparse
from matplotlib import pyplot as plt
from functools import partial
plt.style.use('ggplot')

from anikattu.tokenizer import word_tokenize
from anikattu.tokenstring import TokenString
from anikattu.datafeed import DataFeed, MultiplexedDataFeed
from anikattu.dataset import NLPDataset as Dataset, NLPDatasetList as DatasetList
from anikattu.utilz import tqdm, ListTable
from anikattu.vocab import Vocab
from anikattu.utilz import Var, LongVar, init_hidden, pad_seq, dump_vocab_tsv
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_tokenize = word_punct_tokenizer.tokenize

if __name__ == '__main__':
    start = time.time()

    ########################################################################################
    # Parser arguments
    ########################################################################################
    parser = argparse.ArgumentParser(description='MACNet variant 2')
    parser.add_argument('-p',
                        '--hpconfig',
                        help='path to the hyperparameters config file',
                        default='hpconfig.py',
                        dest='hpconfig')
Example #3
 def compute_wordset(self):
     tokens = WordPunctTokenizer().tokenize(self.text)
     lowercase = [t.lower() for t in tokens]
     return set(lowercase) - {',', '.', '!', ';', ':', '-', '', None}
Example #4
def tokenize(sentence):
    sentence=WordPunctTokenizer().tokenize(sentence.lower())
    return ' '.join(sentence)
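For reference, a quick sanity check of this helper on a made-up input (not from the original source):

# tokenize() lowercases first, so WordPunctTokenizer splits the apostrophe and hyphen
# into their own tokens:
print(tokenize("Don't stop-me now!"))  # "don ' t stop - me now !"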
Example #5
 def wordtokenizer(sentence):
     for i in '! , . ; \' [ ` & # ? : @ < > \ | ] "  "  “ ” _ — - —— & ‘ ’ ( ) / * + ^ = 1 2 3 4 5 6 7 8 9 0':  # strip punctuation and digits
         sentence = sentence.replace(i, ' ')
     words = WordPunctTokenizer().tokenize(sentence)
     return words
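A small illustration of the intended behaviour, assuming the function is available at module level (hypothetical input):

print(wordtokenizer("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's'] -- the punctuation and digits were already replaced by spaces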
Example #6
    sim21 = (idf2 * (matrix2.dot(matrix1.T).max(axis=1))).sum() / idf2.sum()

    return 2 * sim12 * sim21 / (sim12 + sim21)
    # unreachable alternative weighting (left here commented out): a
    # length-weighted average of the two directional scores
    # total_len = matrix1.shape[0] + matrix2.shape[0]
    # return sim12 * matrix2.shape[0] / total_len + sim21 * matrix1.shape[0] / total_len
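The value returned above is the harmonic mean of the two directional scores; a minimal numeric sketch with made-up values:

sim12, sim21 = 0.6, 0.4
print(2 * sim12 * sim21 / (sim12 + sim21))  # 0.48 -- the symmetric similarity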


if __name__ == "__main__":
    w2v = gensim.models.Word2Vec.load('../data/w2v_model_stemmed')

    idf = pickle.load(open('../data/idf', 'rb'))

    question1 = 'intialize all elements in an ArrayList as a specific integer'
    question1 = WordPunctTokenizer().tokenize(question1.lower())
    question1 = [SnowballStemmer('english').stem(word) for word in question1]

    question2 = 'set every element of a list to the same constant value'
    question2 = WordPunctTokenizer().tokenize(question2.lower())
    question2 = [SnowballStemmer('english').stem(word) for word in question2]

    matrix1 = init_doc_matrix(question1, w2v)
    matrix2 = init_doc_matrix(question2, w2v)
    matrix1_trans = matrix1.T
    matrix2_trans = matrix2.T

    idf1 = init_doc_idf_vector(question1, idf)
    idf2 = init_doc_idf_vector(question2, idf)

    #print sim_question_api(question1, question2, idf, w2v)
Example #7
def wordtokenizer(sentence):
    words = WordPunctTokenizer().tokenize(sentence)
    return words
Example #8
def main():
    w2v = gensim.models.Word2Vec.load(
        '../data/skip_w2v_model_stemmed')  # pre-trained word embedding
    idf = pickle.load(
        open('../data/my_idf',
             'rb'))  # pre-trained idf value of all words in the w2v dictionary
    records = pickle.load(open("../data/records_final.pkl", 'rb'))
    print(len(records))
    # get the questions that need recommendations
    experiments = util.get_class_experiments()
    print(len(experiments))

    csvfile_path = os.path.join(args.output_path,
                                "topclass_expand11-10.csv")  #输出结果
    csvfile = open(csvfile_path, 'w', newline="")
    writer = csv.writer(csvfile)
    writer.writerow(
        ["question_title", "top5", "ground_truth_intersection", "true_apis"])
    # union of the APIs from all questions; check whether the answer exists in this set

    # count how many questions can be recommended for, and how many recommendations succeed
    recommend_num = 0
    recommend_success_num = 0
    processnum = 0
    # evaluation metrics
    mrr = 0.0
    map = 0.0
    precision = 0
    recall = 0
    ndcg = 0.0

    rec_num = args.rec_num
    start = time.perf_counter()
    for experiment in experiments:
        experiment_method_annotation = experiment.method_annotation

        # print(experiment_method_annotation)
        experiment_now_method_flat = experiment.now_method_flat
        experiment_true_api = experiment.true_api
        experiment_now_api = experiment.now_api
        # set difference: drop the APIs already present in the current context
        experiment_true_api = set(experiment_true_api) - set(
            experiment_now_api)

        query = experiment_method_annotation
        query_words = WordPunctTokenizer().tokenize(query.lower())
        query_words = [
            SnowballStemmer('english').stem(word) for word in query_words
        ]
        query_matrix = similarity.init_doc_matrix(query_words, w2v)
        query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

        # retrieve the top-N similar questions
        top_questions = similarity.get_topk_questions(query_words,
                                                      query_matrix,
                                                      query_idf_vector,
                                                      records, 11, 0.0)
        # number of similar questions retrieved
        # print(top_questions)
        similar_questions_length = len(top_questions)
        # print("similar_questions_length:",similar_questions_length)
        # check whether the current question is among the similar ones; if not, add it, otherwise build the tensor directly from the similar questions
        flag = False

        similar_records_list = list(top_questions.keys())
        for record in similar_records_list:
            if (record.title_words == query_words):
                flag = True
        processnum += 1
        # the current question is among the similar questions
        record_method_annotation_words = list()
        record_method_flat = list()
        record_api = list()
        for record in similar_records_list:
            if record.title_words not in record_method_annotation_words:
                record_method_annotation_words.append(record.title_words)
            if record.method_block_flat not in record_method_flat:
                record_method_flat.append(record.method_block_flat)
            for api in record.method_api_sequence:
                if api not in record_api:
                    record_api.append(api)
        # add the APIs that already appear in the programming context
        for now_api in experiment_now_api:
            if now_api not in record_api:
                record_api.append(now_api)

        api_rec_all = []

        if flag == True:
            recommend_num += 1
            # build the tensor

            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(record.title_words)],
                           list(record_method_flat_dict.keys()
                                )[list(record_method_flat_dict.values()).
                                  index(record.method_block_flat)],
                           list(record_api_dict.keys(
                           ))[list(record_api_dict.values()).index(concrete_api
                                                                   )]] = 1
            for api in experiment_now_api:
                if api in record_api_dict.values():
                    tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(query_words)], :,
                           list(record_api_dict.keys(
                           ))[list(record_api_dict.values()).index(api)]] = 1
            # handle degenerate cases where the data does not form a full 3-mode tensor
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1 or
                        len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif (len(record_api) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if (len(record_method_annotation_words) == 1):
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()), 1, 1,
                                -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[list(record_method_flat_dict.keys()
                                         )[list(record_method_flat_dict.values(
                                         )).index(two)], :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif (len(record_method_flat) == 1):
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(-matrix[
                            list(record_method_annotation_words_dict.keys(
                            ))[list(record_method_annotation_words_dict.values(
                            )).index(one)], :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)

            else:
                # tensor factorization
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(rank=round(
                    min(len(record_method_annotation_words),
                        len(record_method_flat), len(record_api)) / 2),
                                       validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)

                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()), 1, 1, -1,
                        1).values())[0]

                rec_combine_api_key = np.argsort(
                    -full_tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(one)],
                                 list(record_method_flat_dict.keys()
                                      )[list(record_method_flat_dict.values()).
                                        index(two)], :]).tolist()
                # recommended API list; drop APIs already present in the context
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)

        # the current question is not among the similar questions
        else:
            similar_questions_length += 1

            # skip questions for which no similar question was found
            if similar_questions_length == 1:
                continue
            recommend_num += 1
            # append the incoming query
            record_method_annotation_words.append(query_words)
            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            # build the tensor
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(record.title_words)],
                           list(record_method_flat_dict.keys()
                                )[list(record_method_flat_dict.values()).
                                  index(record.method_block_flat)],
                           list(record_api_dict.keys(
                           ))[list(record_api_dict.values()).index(concrete_api
                                                                   )]] = 1

            for api in experiment_now_api:
                if api in record_api_dict.values():
                    tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(query_words)], :,
                           list(record_api_dict.keys(
                           ))[list(record_api_dict.values()).index(api)]] = 1
            # handle degenerate cases where tensor factorization does not apply
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1 or
                        len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif (len(record_api) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if (len(record_method_annotation_words) == 1):
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()), 1, 1,
                                -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[list(record_method_flat_dict.keys()
                                         )[list(record_method_flat_dict.values(
                                         )).index(two)], :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif (len(record_method_flat) == 1):
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(-matrix[
                            list(record_method_annotation_words_dict.keys(
                            ))[list(record_method_annotation_words_dict.values(
                            )).index(one)], :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)

            else:
                # tensor factorization
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(rank=round(
                    min(len(record_method_annotation_words),
                        len(record_method_flat), len(record_api)) / 2),
                                       validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                # one = query_words
                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()), 1, 1, -1,
                        1).values())[0]

                rec_combine_api_key = np.argsort(
                    -full_tensor[list(record_method_annotation_words_dict.keys(
                    ))[list(record_method_annotation_words_dict.values()).
                       index(one)],
                                 list(record_method_flat_dict.keys()
                                      )[list(record_method_flat_dict.values()).
                                        index(two)], :]).tolist()
                # recommended API list
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)
        # check whether the ground-truth APIs appear among the similar questions
        # print(experiment_true_api)
        # print('----------------------------------')
        experiment_true_api = [
            true_api.split('.')[-2] for true_api in experiment_true_api
        ]
        experiment_true_api = removelist(experiment_true_api)
        experiment_now_api = [
            true_api.split('.')[-2] for true_api in experiment_now_api
        ]
        experiment_now_api = removelist(experiment_now_api)
        # drop the APIs already in experiment_now_api
        experiment_true_api = set(experiment_true_api) - set(
            experiment_now_api)
        record_api = [true_api.split('.')[-2] for true_api in record_api]
        record_api = removelist(record_api)
        api_rec_all = [true_api.split('.')[-2] for true_api in api_rec_all]
        api_rec_all = removelist(api_rec_all)
        for m in set(experiment_now_api):
            if m in api_rec_all:
                api_rec_all.remove(m)
        api_rec = api_rec_all[:rec_num]

        pos = -1
        tmp_map = 0.0
        hits = 0.0
        vector = list()
        for i, api in enumerate(api_rec_all[:rec_num]):
            if api in set(experiment_true_api) and pos == -1:
                pos = i + 1
            if api in set(experiment_true_api):
                vector.append(1)
                hits += 1
                tmp_map += hits / (i + 1)
            else:
                vector.append(0)

        tmp_map /= len(set(experiment_true_api))
        tmp_mrr = 0.0
        if pos != -1:
            tmp_mrr = 1.0 / pos
        map += tmp_map
        mrr += tmp_mrr
        ndcg += calculateNDCG.ndcg_at_k(vector[:rec_num], rec_num)
        ground_truth_intersection = set(api_rec).intersection(
            set(experiment_true_api))
        if (len(ground_truth_intersection) > 0):
            recommend_success_num += 1
        precision += len(ground_truth_intersection) / rec_num
        recall += len(ground_truth_intersection) / len(
            set(experiment_true_api))
        writer.writerow([
            experiment_method_annotation, api_rec, ground_truth_intersection,
            experiment_true_api
        ])

    writer.writerow(["recommend_num", "recommend_success_num"])
    writer.writerow([recommend_num, recommend_success_num])
    writer.writerow([
        "mrr/recommend_num", "recommend_num", "map/recommend_num",
        "success_rate@N", "precision@N/recommend_num",
        "recall@N/recommend_num", "ndcg/recommend_num"
    ])
    writer.writerow([
        mrr / recommend_num, recommend_num, map / recommend_num,
        recommend_success_num / recommend_num, precision / recommend_num,
        recall / recommend_num, ndcg / recommend_num
    ])
    csvfile.close()
    end = time.perf_counter()

    print('Running time: %s Seconds' % (end - start))

    logging.info("Finish")
Example #9
def wordata(file, n, wordExcept=False, lenth=False):
    FileName = file.path
    mode = ''
    textHeap = []

    if FileName[-3:] == 'zip':
        print("*Zip 파일을 입력 받습니다. ")
        FileName = getZip(FileName)
        FileName = getFiles(FileName)
        mode = 'Zip'

    # process selectively depending on mode
    print("*파일 읽기를 시작합니다.")

    if mode == 'Zip':
        for file in FileName:
            textHeap += read_file(file)

    else:
        textHeap = read_file(FileName)

    # delete the files inside the unzipped folder
    for file in FileName:
        removeFolder(file)

    #Tokenization
    print("*텍스트 분석을 시작합니다.")
    tokenizer = WordPunctTokenizer()
    TokenizedWords = []

    for text in textHeap:
        TokenizedWords += tokenizer.tokenize(text)
    print("*문서 안의 전체 단어 개수: {}".format(len(TokenizedWords)))
    if mode == 'Zip':
        for file in FileName:
            removeFolder(file)

    # load stopwords
    os.chdir(os.getcwd())
    system_path = os.getcwd()
    now_path = str(system_path) + '\\errorword'
    errorWords = pd.read_csv(now_path + "\\errorWords.csv", header=None)

    stop_words = set(stopwords.words('english'))  # stopwords defined by default in NLTK
    stop_words = stop_words | set(pd.Series(errorWords[0]).to_list())

    # remove words by school level
    if wordExcept == 1:
        elementWord = pd.read_csv(now_path + "\\초등800.csv", header=None)
        stop_words = stop_words | set(pd.Series(elementWord[0]).to_list())
        print('초등 영단어 제거 성공')
    elif wordExcept == 2:
        middleWord = pd.read_csv(now_path + "\\중등2000.csv", header=None)
        stop_words = stop_words | set(pd.Series(middleWord[0]).to_list())
        print('중등 영단어 제거 성공')
    elif wordExcept == 3:
        highWord = pd.read_csv(now_path + "\\고등3000.csv", header=None)
        stop_words = stop_words | set(pd.Series(highWord[0]).to_list())
        print('고등 영단어 제거 성공')

    np_words = np.array(
        TokenizedWords)  # convert the tokenized words to a numpy array
    delete_index = []  # indices of stopwords to delete

    print("*1차 불용어 제거를 시작합니다.")
    for i in range(len(np_words)):
        np_words[i] = re.sub("[^a-zA-Z]", "", np_words[i])

        if (np_words[i] in stop_words) == True:
            delete_index.append(i)
        if len(np_words[i]) <= 1:
            delete_index.append(i)

    TrimmedWords = np.delete(np_words, delete_index)  # delete the stopword indices
    print('제거 후 단어 수: {}'.format(len(TrimmedWords)))

    # POS tagging
    tagged_list = pos_tag(TrimmedWords)
    verb = []
    noun = []
    adject = []
    adverb = []
    for w in tagged_list:
        if w[1] == 'VB' or w[1] == 'VBD' or w[1] == 'VBG' or w[
                1] == 'VBN' or w[1] == 'VBP' or w[1] == 'JJ':
            verb.append(w)
        elif w[1] == 'NNS' or w[1] == 'NNPS' or w[1] == 'NN':
            if len(w[0]) > 3 and w[0][
                    -3:] == 'ing':  # if it is an -ing form (present participle), treat it as a verb
                verb.append(w)
            else:
                noun.append(w)
        elif w[1] == 'JJ' or w[1] == 'JJR' or w[1] == 'JJS':
            adject.append(w)
        elif w[1] == 'RBR' or w[1] == 'RBS' or w[1] == 'RB':
            adverb.append(w)

    verb = untag(verb)
    noun = untag(noun)
    adject = untag(adject)
    adverb = untag(adverb)

    restoredVerb = []  # restore verbs to their base form
    for v in verb:
        restoredVerb.append(WordNetLemmatizer().lemmatize(v.lower(), pos='v'))
    restoredNoun = [WordNetLemmatizer().lemmatize(w, pos='n')
                    for w in noun]  # restore nouns to their base form
    restoredAdject = [
        WordNetLemmatizer().lemmatize(w, pos='a') for w in adject
    ]  # restore adjectives to their base form
    restoredAdverb = [
        WordNetLemmatizer().lemmatize(w, pos='r') for w in adverb
    ]  # restore adverbs to their base form

    # merge the restored data
    combinedWords = restoredVerb + restoredNoun + restoredAdject + restoredAdverb
    print("*필터된 단어의 개수: {}".format(len(combinedWords)))

    np_words = np.array(
        combinedWords)  # convert the combined words to a numpy array
    delete_index_2 = []  # indices of stopwords to delete

    print("*2차 불용어 제거를 시작합니다.")
    for i in range(len(np_words)):
        #     np_words[i] = np_words[i].lower()  # lowercase every word
        if (np_words[i] in stop_words) == True:
            delete_index_2.append(i)
        if len(np_words[i]) >= 20 or len(np_words[i]) <= 2:
            delete_index_2.append(i)
    TrimmedWords = np.delete(np_words, delete_index_2)  # delete the stopword indices
    print('제거 후 단어 수: {}'.format(len(TrimmedWords)))
    resultWords = TrimmedWords

    overNum = n  # keep words whose frequency exceeds n

    print('*중복된 단어의 갯수를 셉니다.')
    cleansing = pd.Series(resultWords).value_counts()
    removedOverlabWords = pd.DataFrame()
    removedOverlabWords['Word'] = cleansing.index
    removedOverlabWords['value count'] = cleansing.values
    removedOverlabWords = removedOverlabWords[
        removedOverlabWords['value count'] > overNum]
    print("*** 단어 분석 완료 ***")
    print('{}개 이상의 빈도수 단어를 추출합니다.'.format(overNum))
    print("최종 단어 수 : {}".format(removedOverlabWords['Word'].count()))

    # build the graph
    img_path = str(system_path) + '\\static\\img'

    lenth_1 = 50
    start = 50
    if len(removedOverlabWords) < 50:
        lenth_1 = len(removedOverlabWords)
        start = 10
    cumsum = (np.cumsum(
        removedOverlabWords['value count'][:50]).to_list()[-1]) * 1.2
    count = removedOverlabWords['value count'].to_list()[0] * 1.2

    fig, ax1 = plt.subplots()
    fig.set_size_inches(17, 10)  # resize the figure (inches)

    color = '#B0E0E6'
    ax1.set_ylabel('', color='black',
                   size=30)  # we already handled the x-label with ax1
    ax1.bar(range(1, lenth_1 + 1),
            np.cumsum(removedOverlabWords['value count'][:lenth_1]),
            color=color,
            width=0.5,
            label='Cumsum')

    ax2 = ax1.twinx()

    matplotlib.rc('ytick', labelsize=25)
    matplotlib.rc('xtick', labelsize=30)
    color = '#5c3090'
    ax2.set_ylabel(' ', color='black', size=30)
    ax2.plot(range(1, lenth_1 + 1),
             removedOverlabWords['value count'][:lenth_1],
             color=color,
             label='Frequency',
             linewidth=15)

    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, prop={'size': 20})
    ax2.set_ylim(start, count)
    ax1.set_ylim(-20, cumsum)
    plt.grid()
    fig.tight_layout()
    fig.savefig(img_path + '\\graph.png')  # save the current figure
    print('그래프 생성 완료')

    # apply the lenth parameter
    print('*단어장 형성을 시작합니다.')
    if lenth == False:
        WordsLenth = removedOverlabWords['Word'].to_list()
    else:
        lenth -= 1
        WordsLenth = removedOverlabWords['Word'].loc[:lenth].to_list()

    word_url = []  # first url returned by each word search
    dictlink = 'https://endic.naver.com'  # Naver dictionary homepage url

    for i in range(len(WordsLenth)):  # repeat for each word
        url = "https://endic.naver.com/search.nhn?sLn=kr&query=" + WordsLenth[
            i]  # append the English word to the search url
        res = requests.get(url).text  # request the url and keep the response text
        soup = BeautifulSoup(res, "lxml")  # hand the response to BeautifulSoup

        for link in soup.findAll(
                "a", href=re.compile("^(/enkrEntry)")
        ):  # find every <a> tag whose href starts with /enkrEntry
            if 'href' in link.attrs:  # if the link has an href attribute
                word = link.attrs[
                    'href']  # the href is the relative url of the dictionary entry page
                word = dictlink + word  # prepend the Naver dictionary homepage url
                word_url.append(word)  # store the combined url in the word_url list
                break  # keep only the first entry url and stop

    selecter = [
        [
            'span.fnt_k06', 'p.bg span.fnt_e07._ttsText',
            'dd.first p span.fnt_k10', 'span.fnt_syn',
            '#content > div.word_view > div.tit > h3'
        ],
        [
            '#zoom_content > div:nth-child(6) > dl > dt.first.mean_on.meanClass > em > span.fnt_k06',
            '#zoom_content > div:nth-child(6) > dl > dd:nth-child(2) > p.bg > span',
            '#zoom_content > div:nth-child(6) > dl > dd:nth-child(2) > p:nth-child(2) > span',
            '#ajax_hrefContent > li:nth-child(2) > a',
            '#content > div.word_view > div.tit > h3'
        ],
        [
            '#zoom_content > div:nth-child(7) > dl > dt > em > span.fnt_k06',
            '#zoom_content > div:nth-child(7) > dl > dd.first > p.bg > span',
            '#zoom_content > div:nth-child(7) > dl > dd.first > p:nth-child(2) > span',
            '#ajax_hrefContent > li:nth-child(3) > a',
            '#content > div.word_view > div.tit > h3'
        ]
    ]

    def makeVoca(temp_num, dataFrame, soup, selecter, word_name, number, freq):
        global word_url

        if temp_num == 0:
            pass
        else:
            if number == dataFrame.loc[len(dataFrame) - 1][0]:
                word_name = None
                freq = None
                number = None

            elif number > 2 and number == dataFrame.loc[len(dataFrame) - 2][0]:
                word_name = None
                freq = None
                number = None
            elif number > 3 and number == dataFrame.loc[len(dataFrame) - 3][0]:
                word_name = None
                freq = None
                number = None

        words = soup.select(selecter[0])  # word meaning
        if len(words) == 0:
            return

        examples = soup.select(selecter[1])  # example sentence
        if len(examples) == 0:
            example = None
        else:
            example = examples[0].get_text().strip()

        inperprets = soup.select(selecter[2])  # translation of the example sentence
        if len(inperprets) == 0:
            interpretation = None
        else:
            interpretation = inperprets[0].get_text().strip()
        parts = soup.select(selecter[3])  # part of speech
        if len(parts) == 0:
            part = None
        else:
            part = parts[0].get_text().strip()

        if part == '동사':
            part = 'V'
        elif part == '명사':
            part = 'N'
        elif part == '형용사':
            part = 'adj'
        elif part == '부사':
            part = 'adv'
        else:
            pass

        voca = soup.select(selecter[4])  # headword of the entry
        Words = voca[0].get_text().strip()
        meaning = words[0].get_text().strip()  # keep only the first meaning, with tags stripped

        dataFrame.loc[len(dataFrame)] = [
            number, word_name, part, meaning, example, interpretation, freq
        ]
        return dataFrame

    temp_num = 0
    dataFrame = pd.DataFrame(
        columns=['번호', '단어', '품사', '뜻', '예문', '해석', '빈도수'])
    word_number = 1

    for j in range(len(WordsLenth)):  # repeat for each word
        response = requests.get(word_url[j]).text  # request the url and keep the response text
        soup_2 = BeautifulSoup(response,
                               "lxml")  # parse with BeautifulSoup using lxml
        for i in range(len(selecter)):
            makeVoca(temp_num, dataFrame, soup_2, selecter[i], WordsLenth[j],
                     word_number, removedOverlabWords['value count'].loc[j])
            temp_num = 1
        word_number += 1
    print("***단어장 형성 완료***")
    return dataFrame
Example #10
def tokenize_with_wordpunct_tokenizer(text):
    wp_tokenizer = WordPunctTokenizer()
    return wp_tokenizer.tokenize(text)
Example #11
        size = os.path.getsize(
            '/Users/wenmi/Desktop/未命名文件夹/车光阳个人文件/大三学校课程/数据挖掘/期末大作业/d30033t/data/'
            + sum_name)
        if size > para2:
            with open(sum_name, 'wt') as f:
                f.write('')
            for j in range(i):
                sentence_index = simi3[j][0]
                with open(sum_name, 'a') as f:
                    f.write(text[sentence_index] + '\n')
            break


text1 = []
for i in range(len(text)):
    words = WordPunctTokenizer().tokenize(text[i])
    text1.append(words)
# tokenization gives a list of word lists

for ele in text1:
    for i in range(len(ele)):
        ele[i] = lancaster_stemmer.stem(ele[i])
# stemming

model0 = gensim.models.Word2Vec(text1)
model0.save('word2vector_model')
model1 = gensim.models.Word2Vec.load('word2vector_model')
# train word vectors and load them back as model1

sentence_vector = []
for ele in text1:
Example #12
            title.append(r1[3])
            abstract.append(r1[4])
    conn.close()
    return keywords, article_id, year, title, abstract


keywords, article_id, year, title, abstract = Get_data_in_sql()

words = []
for i in range(len(keywords)):
    keywords[i] = keywords[i].split(";")
    for j in range(len(keywords[i])):
        words.append(keywords[i][j])

for s in title:
    temp = WordPunctTokenizer().tokenize(s)
    for w in temp:
        words.append(w)

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
for i in range(len(abstract)):
    sentences = tokenizer.tokenize(abstract[i])
    for s in sentences:
        temp = WordPunctTokenizer().tokenize(s)
        for w in temp:
            words.append(w)

words_dict = {}
count = 0
stop_words_file = open(r"C:\Users\hjn\Desktop\关键词课题\英文停用词.txt",
                       encoding='utf-8',
Example #13
import pickle
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer
import math

answer_corpora = pickle.load(open('../data/titles.pkl', 'rb'))

texts = [
    WordPunctTokenizer().tokenize(answer.lower()) for answer in answer_corpora
]

idf = {}

stemmer = SnowballStemmer('english')
cc = 0
for text in texts:

    tmp_set = set()

    cc += 1

    for word in text:
        tmp_set.add(word)

    for word in tmp_set:
        word = stemmer.stem(word)
        if word in idf:
            idf[word] += 1
        else:
            idf[word] = 1
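The loop above only accumulates document frequencies, and `math` is imported but never used in the snippet, which suggests the counts are converted to IDF values afterwards; a hedged sketch of that step using the standard formula:

num_docs = len(texts)  # cc also ends up equal to this
idf = {word: math.log(num_docs / df) for word, df in idf.items()}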
Example #14
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    input_file = "ubuntu_train_subtask_1.json"
    output_file = "word2vec_100_embedding.txt"

    sentences = []
    with open(input_file, 'rb') as f:
        json_data = ijson.items(f, 'item')
        for i, entry in enumerate(json_data):
            row = process_dialog(entry)[:-1]  # [ctx, utr0, utr1, ..., utr99]
            # row = select_ahead_hinder_part(row, maxlen=160, hinder=True)
            for uter in row:
                sentences.append(WordPunctTokenizer().tokenize(uter))
            if (i + 1) % 1000 == 0:
                print("Cached {} examples.".format(i + 1))

    model = Word2Vec(sentences,
                     size=100,
                     window=5,
                     min_count=1,
                     sg=1,
                     workers=multiprocessing.cpu_count())

    model.wv.save_word2vec_format(output_file, binary=False)
Example #15
# -*- coding: utf-8 -*-
"""
Punctuation tokenizer
"""
#Tokenizes a text into a sequence of alphabetic and non-alphabetic characters.
#splits all punctuations into separate tokens

from nltk.tokenize import WordPunctTokenizer
text = "Hey @airvistara , not #flyinghigher these days we heard? #StayingParkedStayingSafe #LetsIndiGo laugh/cry"
tokens = WordPunctTokenizer().tokenize(text)
print(tokens)
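For reference, the expected output splits every punctuation run into its own token:

# ['Hey', '@', 'airvistara', ',', 'not', '#', 'flyinghigher', 'these', 'days', 'we',
#  'heard', '?', '#', 'StayingParkedStayingSafe', '#', 'LetsIndiGo', 'laugh', '/', 'cry']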
Example #16
    def _load_data(self):
        r'''Loading dataset, invoked during the initialization of :class:`MultiTurnDialog`.
		'''
        origin_data = {}
        for key in self.key_name:
            with open('%s/ubuntu_corpus_%s.csv' %
                      (self._file_path, key)) as data_file:
                raw_data = list(csv.reader(data_file))
                head = raw_data[0]
                if head[2] == 'Label':
                    raw_data = [
                        d[0] + d[1] for d in raw_data[1:] if d[2] == '1.0'
                    ]
                else:
                    raw_data = [d[0] + d[1] for d in raw_data[1:]]

                raw2line = lambda raw: [WordPunctTokenizer().tokenize(sent) \
                  for sent in raw.strip().replace('__eou__', '').split('__eot__')]
                origin_data[key] = {'session': list(map(raw2line, raw_data))}

        raw_vocab_list = list(
            chain(*chain(*(origin_data['train']['session']))))
        # Important: Sort the words preventing the index changes between different runs
        vocab = sorted(Counter(raw_vocab_list).most_common(),
                       key=lambda pair: (-pair[1], pair[0]))
        left_vocab = list(
            filter(lambda x: x[1] >= self._min_vocab_times, vocab))
        left_vocab = list(map(lambda x: x[0], left_vocab))
        vocab_list = self.ext_vocab + left_vocab
        valid_vocab_len = len(vocab_list)
        valid_vocab_set = set(vocab_list)

        for key in self.key_name:
            if key == 'train':
                continue
            raw_vocab_list.extend(
                list(chain(*chain(*(origin_data[key]['session'])))))
        vocab = sorted(Counter(raw_vocab_list).most_common(), \
              key=lambda pair: (-pair[1], pair[0]))
        left_vocab = list( \
         filter( \
          lambda x: x[1] >= self._invalid_vocab_times and x[0] not in valid_vocab_set, \
          vocab))
        left_vocab = list(map(lambda x: x[0], left_vocab))
        vocab_list.extend(left_vocab)

        print("valid vocab list length = %d" % valid_vocab_len)
        print("vocab list length = %d" % len(vocab_list))

        word2id = {w: i for i, w in enumerate(vocab_list)}
        line2id = lambda line: ([self.go_id] + list(\
           map(lambda word: word2id.get(word, self.unk_id), line)) + \
           [self.eos_id])[:self._max_sent_length]

        data = {}
        data_size = {}
        for key in self.key_name:
            data[key] = {}
            data[key]['session'] = [list(map(line2id, session[:self._max_turn_length])) \
              for session in origin_data[key]['session']]
            data_size[key] = len(data[key]['session'])
            vocab = list(chain(*chain(*(origin_data[key]['session']))))
            vocab_num = len(vocab)
            oov_num = len(list(filter(lambda word: word not in word2id,
                                      vocab)))
            invalid_num = len(
                list(filter(lambda word: word not in valid_vocab_set,
                            vocab))) - oov_num
            sent_length = list(map(len, chain(*origin_data[key]['session'])))
            cut_word_num = np.sum(
                np.maximum(
                    np.array(sent_length) - self._max_sent_length + 2, 0))
            turn_length = list(map(len, origin_data[key]['session']))
            sent_num = np.sum(turn_length)
            cut_sent_num = np.sum(
                np.maximum(np.array(turn_length) - self._max_turn_length, 0))
            print(("%s set. invalid rate: %f, unknown rate: %f, max sentence length before cut: %d, " + \
              "cut word rate: %f\n\tmax turn length before cut: %d, cut sentence rate: %f") % \
              (key, invalid_num / vocab_num, oov_num / vocab_num, max(sent_length), \
              cut_word_num / vocab_num, max(turn_length), cut_sent_num / sent_num))
        return vocab_list, valid_vocab_len, data, data_size
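A minimal sketch of what the `raw2line` lambda in `_load_data` does to one raw Ubuntu-corpus line (made-up input):

from nltk.tokenize import WordPunctTokenizer
raw2line = lambda raw: [WordPunctTokenizer().tokenize(sent)
                        for sent in raw.strip().replace('__eou__', '').split('__eot__')]
print(raw2line("hi there __eou__ __eot__ hello , how are you ? __eou__"))
# [['hi', 'there'], ['hello', ',', 'how', 'are', 'you', '?']]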
Example #17
def readData():
    reviewfileName = 'reviews.csv'
    reviewTermFileName = 'Terms.csv'
    reviewSummaryFileName = 'Summary.csv'
    targetReview = open(reviewfileName, 'w', encoding='utf-8')
    targetTerm = open(reviewTermFileName, 'w', encoding='utf-8')
    targetSummary = open(reviewSummaryFileName, 'w', encoding='utf-8')

    tagList = [
        "CD", "CC", "DT", "JJ", "JJR", "JJS", "PDT", "NN", "PRP$", "PRP",
        "NNS", "NNP"
    ]
    tagListNumeric = ["CD"]

    reviewID = 1
    word_punct_tokenizer = WordPunctTokenizer()
    with open('reviews_Baby.json') as f:
        for line in f:
            oneReviewData = json.loads(line)
            if 'reviewerName' in oneReviewData:
                reviewerName = oneReviewData['reviewerName'].replace(
                    "'", "").replace('"', "").replace(",",
                                                      " ").replace("\\", " ")
            if 'reviewerID' in oneReviewData:
                reviewerID = oneReviewData['reviewerID'].replace(
                    "'", "").replace('"', "").replace(",", " ")
            if 'asin' in oneReviewData:
                asin = oneReviewData['asin'].replace("'", "").replace(
                    '"', "").replace(",", " ")
            if 'helpful' in oneReviewData:
                helpful = str(oneReviewData['helpful']).replace(",", " ")
            if 'overall' in oneReviewData:
                overall = oneReviewData['overall']
            if 'summary' in oneReviewData:
                summary = oneReviewData['summary'].replace("'", "").replace(
                    '"', "").replace(",", " ")
            if 'reviewTime' in oneReviewData:
                reviewTime = oneReviewData['reviewTime'].replace(
                    "'", "").replace('"', "").replace(",", " ")
            if 'unixReviewTime' in oneReviewData:
                unixReviewTime = oneReviewData['unixReviewTime']

            review = reviewerID + ',' + asin + ',' + reviewerName + ',' + helpful + ',' + str(
                overall) + ',' + summary.replace(
                    "'", "").replace('"', "") + ',' + str(reviewTime).replace(
                        ',',
                        "") + ',' + str(unixReviewTime) + ',' + str(reviewID)
            termText = ""
            summaryText = ""

            if (oneReviewData['reviewText'] is not None
                    and oneReviewData['reviewText'] != ''):
                reviewText = oneReviewData['reviewText'].replace("-", " ")
                termText = parseGrammer(reviewText, reviewID).strip()
            if (oneReviewData['summary'] is not None
                    and oneReviewData['summary'] != ''):
                reviewSummary = oneReviewData['summary'].replace("-", " ")
                summaryText = parseGrammer(reviewSummary, reviewID).strip()
            if (reviewID != None
                    and ((termText != None and termText != "") or
                         (summaryText != None and summaryText != ""))):
                targetReview.write(review + '\n')
                targetReview.flush()

                if (termText != None and termText != ""):
                    targetTerm.write(termText + '\n')
                    targetTerm.flush()
                if (summaryText != None and summaryText != ""):
                    targetSummary.write(summaryText + '\n')
                    targetSummary.flush()
                #print(termText)
            print(reviewID)
            reviewID += 1

    targetReview.close()
    targetTerm.close()
    targetSummary.close()
Example #18
print("Reading CSV File ...")
with open('data/reddit-comments-2015-08.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)

    fields = next(reader)
    for row in reader:

        ##different ways to tokenize the sentence
        #tokens_nltk = word_tokenize(row[0])
        #tokens_split = row[0].split()
        #print("The length of the list with"len(tokens_nltk))
        #print(len(tokens_split))
        #print(len(match_tokenizer.tokenize(row[0])))

        ##words and punctuations tokenizer
        sentence = WordPunctTokenizer().tokenize(row[0])
        #print(len(sentence))
        temp = [sentence_start_token] + sentence + [sentence_end_token]
        #print(len(tokenized_sentence))
        #data_analysis = FreqDist(tokenized_sentence)
        #data_analysis.plot(100, cumulative=False)
        tokenized_sentence.append(temp)
        num_sentences += 1

#print("The number of sentences read is %d"%(num_sentences))
#print(tokenized_sentence)

##to get the frequency distribution
all_sentences = []
for x in tokenized_sentence:
    all_sentences += x
Example #19
from django.core.management.base import BaseCommand
import os
import optparse
import numpy as np
import pandas as pd
import math
import json
import copy
from BeautifulSoup import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
tknzr = WordPunctTokenizer()
#nltk.download('stopwords')
stoplist = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer
from books_recsys_app.models import MovieData
from django.core.cache import cache



#python manage.py load_data --input=plots.csv --nmaxwords=30000  --umatrixfile=umatrix.csv
class Command(BaseCommand):

    option_list = BaseCommand.option_list + (
            optparse.make_option('-i', '--input', dest='input',
                                 type='string', action='store',
                                 help=('Input plots file')),
            optparse.make_option('--nmaxwords', '--nmaxwords', dest='nmaxwords',
Example #20
def ann_to_bio(corpus, bio_file):
    def get_bio_tag(index, entities):
        # get bio tag based on the start index
        for entity in entities:
            if entity.start == index:
                return 'B'
            elif entity.start < index < entity.end:
                return 'I'
        return 'O'

    # db and regex
    atom_db = leveldb.LevelDB('data/atom_db')
    upper_number = re.compile(r'^[A-Z]+[0-9]+[A-Z]*$')
    punctuation = re.compile(r'^[^A-Za-z0-9]+$')

    # sequence output template
    template = '{token}\t{pos}\t{pos_2}\t{lower}\t{is_upper}\t' \
               '{is_title}\t{is_first}\t{is_digit}\t{is_punkt}\t' \
               '{is_upper_number}\t{suffix_3}\t{suffix_2}\t{prefix_3}\t{prefix_2}\t' \
               '{in_db}\t{label}\n'

    # sentence splitter
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    # use WordPunctTokenizer to split disease-suppressor to disease, -, suppressor
    word_punct_tokenizer = WordPunctTokenizer()

    # ann reader
    reader = AnnReader()

    # open the bio file
    bio_file_handler = open(bio_file, 'a')

    for root, _, files in os.walk(corpus):
        for file in files:
            if not file.endswith('.ann'):
                continue

            pmid = file[:-4]
            annotation = reader.parse_file(os.path.join(root, file))
            entities = annotation.entities
            entities = sorted(entities, key=lambda a: a.start)
            text = FileProcessor.read_file(os.path.join(root, pmid + '.txt'))
            sentences = sent_detector.tokenize(text.strip())
            index = 0

            for sentence in sentences:
                # tokenization
                tokens = word_punct_tokenizer.tokenize(sentence)
                # get pos tag
                pos_tags = nltk.pos_tag(tokens)

                for i, pos_tag in enumerate(pos_tags):
                    token, pos = pos_tag
                    index = text.find(token, index)

                    if index == -1:
                        raise Exception
                    # get bio tag
                    bio_tag = get_bio_tag(index, entities)

                    try:
                        atom_db.Get(token.lower().encode('utf-8'))
                        in_db = True
                    except KeyError:
                        in_db = False

                    is_upper_number = False if upper_number.match(
                        token) is None else True
                    is_punctuation = False if punctuation.match(
                        token) is None else True

                    bio_file_handler.write(
                        template.format(token=token,
                                        pos=pos,
                                        pos_2=pos[:2],
                                        lower=token.lower(),
                                        is_upper=token.isupper(),
                                        is_title=token.istitle(),
                                        is_first=(i == 0),
                                        is_digit=token.isdigit(),
                                        is_punkt=is_punctuation,
                                        is_upper_number=is_upper_number,
                                        suffix_3=token[-3:],
                                        suffix_2=token[-2:],
                                        prefix_3=token[:3],
                                        prefix_2=token[:2],
                                        in_db=in_db,
                                        label=bio_tag))
                    index += len(token)

                # add a newline to separate sentence
                bio_file_handler.write('\n')

    bio_file_handler.close()
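The tokenizer choice explained in the comment above (split "disease-suppressor" into "disease", "-", "suppressor") is easy to verify in isolation:

from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize('disease-suppressor'))  # ['disease', '-', 'suppressor']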
Example #21
import fasttext as fs
from pathlib import Path
from nltk.tokenize import WordPunctTokenizer
import numpy as np
from typing import List, Any, Tuple, Dict
from keras.models import model_from_json
import pickle
import os

TOKENIZER = WordPunctTokenizer()


class TextClassifier:

    __instance = None

    @staticmethod
    def getInstance():
        if TextClassifier.__instance == None:
            TextClassifier()
        return TextClassifier.__instance

    def __init__(self, model_path={}):

        if TextClassifier.__instance != None:
            raise Exception("This class is a singleton!")
        else:
            TextClassifier.__instance = self

        self.__w_v_s = 100
        self.__sen_s = 20
Example #22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import collections
import math
import os
import random

import numpy as np
import tensorflow as tf
import pickle
import nltk
from nltk.tokenize import WordPunctTokenizer
from path import *
word_cut = WordPunctTokenizer()
tokenizer = nltk.data.load(nltk_path)

# Read the data into a list of strings.
# def read_data(file_path):
#   dirs = os.listdir(file_path)
#   data = list()
#   for dir in dirs:
#       f = open(file_path + "/" + dir, "r")
#       for line in f:
#           line = line.replace('.', '')
#           line = line.replace('!', '')
#           line = line.replace('<br /><br />', '')
#           line = line.replace('?', '')
#           line = line.replace('*', '')
#           line = line.replace(',', '')
#           data.extend(line.split())
Example #23
 def __init__(self):
     self.word_tokenizer = WordPunctTokenizer()
Example #24
 def __init__(
     self,
     punct: bool = None,
 ):
     self.WPtokenizer = WordPunctTokenizer()
     self.punct = punct
Example #25
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer

# Define input text
input_text = "DO you know how tokenization works? It's actually quite interesting! Let's analyse a couple of sentences and figure it out."

# Sentence tokenizer
print("\n Sentence Tokenizer:")
print(sent_tokenize(input_text))

# WordPunct tokenizer
print("\nWord punct tokenizer:")
print(WordPunctTokenizer().tokenize(input_text))
# Splitting a given corpus into units called tokens is known as tokenization.
# The token unit depends on the situation, but tokens are usually defined as meaningful units.

# ------------------------------------------------------------------------------------
# 1. Word Tokenization: when the token unit is a word (look up the details of each function as you use it)
# 1) word_tokenize
from nltk.tokenize import word_tokenize
print(word_tokenize("Don't be a fooled by the dark sounding name, Ms. Hanna's Orphanage is as cheery as cheery goes for a party shop."))

# ['Do', "n't", 'be', 'a', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', \
# 'Ms.', 'Hanna', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', \
# 'for', 'a', 'party', 'shop', '.']

# 2) WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize("Don't be a fooled by the dark sounding name, Ms. Hanna's Orphanage is as cheery as cheery goes for a party shop."))
# ['Don', "'", 't', 'be', 'a', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ','\
# , 'Ms', '.', 'Hanna', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', \
# 'goes', 'for', 'a', 'party', 'shop', '.']

# 3) text_to_word_sequence
from tensorflow.keras.preprocessing.text import text_to_word_sequence
print(text_to_word_sequence("Don't be a fooled by the dark sounding name, Ms. Hanna's Orphanage is as cheery as cheery goes for a party shop."))
# ["don't", 'be', 'a', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'ms', \
# "hanna's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', \
# 'a', 'party', 'shop']

# 4) TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
Example #27
"""
toy example for word2vec
"""
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
tokenizer = WordPunctTokenizer()
sent_tok = PunktSentenceTokenizer()
with open('brown.txt') as inp:
    data = inp.read()
sentences = sent_tok.tokenize(data)
for i in range(0, len(sentences)):
    sentences[i] = tokenizer.tokenize(sentences[i].lower())

model = Word2Vec(sentences, size=50, iter=3)
model.save('brown.w2v')
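`cosine` is imported above but never used; a short usage sketch of the trained model (assumes both words appear in the Brown corpus vocabulary and the older gensim API implied by the `size=`/`iter=` arguments):

print(1 - cosine(model['man'], model['woman']))  # cosine similarity between two word vectors
print(model.most_similar('woman', topn=3))       # nearest neighbours in the embedding space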
Example #28
def sentence_to_words(sent):
    words = WordPunctTokenizer().tokenize(sent)
    return words
"""
Get LIWC counts over a block of text
for a specified category.
"""
import os, re
from collections import Counter
from nltk.tokenize import WordPunctTokenizer

TKNZR = WordPunctTokenizer()


def get_category_counts(text_string, category_patterns):
    """
    Compute category counts (from words) in raw
    text string. Assumes that tokenization is 
    not needed!
    
    Parameters:
    -----------
    text_string : str
    category_patterns : [re.Pattern]
    
    Returns:
    --------
    category_counts : {str : int}
    """
    category_counts = {}
    text_string = text_string.lower()
    #for t in TKNZR.tokenize(text_string):
    for pattern in category_patterns:
        p = r'\b' + pattern.pattern + r'\b'
        # assumption: the snippet cuts off here; counting word-boundary matches
        # per pattern is one plausible reading of the docstring
        category_counts[pattern.pattern] = len(re.findall(p, text_string))
    return category_counts


def clean_text(text):
    # signature reconstructed from the df_yelp_review['text'].apply(clean_text) call below
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)

vectorizer_reviews = CountVectorizer(min_df = .01,max_df = .99, tokenizer = WordPunctTokenizer().tokenize)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])

print(vectorized_reviews.shape) 

' | '.join(vectorizer_reviews.get_feature_names()[:100]) # only the first 100   

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(df_yelp_review['text'])
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])

final_tf_idf = tf_idf_vect.transform(df_yelp_review['text'])
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words includin-g both unigrams and bigrams ", final_tf_idf.get_shape()