Example #1
def training_set(review_doc):
    min_num = 100
    nlp = spacy.load("en_core_web_sm")

    #star_rating_list, reviews = load_csv_info("10000reviews.txt")    
    star_rating_list, reviews = load_csv_info(review_doc)   

    for ii, review in enumerate(reviews):
        res = len(review.split()) 
        if res > min_num:
            reviews[ii] = ' '.join(review.split()[0:min_num])
    
    reviews_12, reviews_45, reviews1245, star_12, star_45, star1245 = split_reviews(star_rating_list, reviews)
    
    #Prior probability
    prior_neg, prior_pos = prior_of_the_classes(reviews_12, reviews_45)

    
    ######## NEGATIVE REVIEWS ######## 
    text = '\n'.join(reviews_12)
    pre_processed_12 = pp.pre_process(text, nlp)
    freq12, most_common12 = frequency(pre_processed_12)
    
    ######## POSITIVE REVIEWS ######## 
    text = '\n'.join(reviews_45)
    pre_processed_45 = pp.pre_process(text, nlp)
    freq45, most_common45 = frequency(pre_processed_45)
    
    pre_processed_1245 = pre_processed_12 + pre_processed_45
    vocabulary, voc_common = frequency(pre_processed_1245)
    
    return freq12, most_common12, freq45, most_common45, vocabulary, voc_common, prior_neg, prior_pos
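The helpers load_csv_info, split_reviews, prior_of_the_classes and frequency above are project-specific and not shown on this page. A minimal sketch of what frequency might look like, assuming it returns a token-count mapping plus the most common tokens (hypothetical helper, not the project's actual code):

from collections import Counter

def frequency(tokens, n_most_common=20):
    # Count how often each pre-processed token occurs and keep the top n.
    counts = Counter(tokens)
    return counts, counts.most_common(n_most_common)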
Example #2
def train_clf():
    df = load_dataset()

    X = df['Messages']
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    X_train = pd.Series(pre_process(X_train))
    y_train = np.where(y_train == 'ham', 0, 1)
    vect = TfidfVectorizer(min_df=5, ngram_range=(2, 5)).fit(X_train)
    with open("./vect.pkl", 'wb') as f:
        pickle.dump(vect, f)

    X_train_vectorized = vect.transform(X_train)

    X_test = pd.Series(pre_process(X_test))
    y_test = np.where(y_test == 'ham', 0, 1)
    X_test_vectorized = vect.transform(X_test)

    X_train_len = X_train.apply(len)
    X_train_digits = X_train.str.count(r'\d')
    X_train_non = X_train.str.count(r'\W')

    X_test_len = X_test.apply(len)
    X_test_digits = X_test.str.count(r'\d')
    X_test_non = X_test.str.count(r'\W')

    X_train_1 = add_feature(X_train_vectorized, X_train_len)
    X_train_2 = add_feature(X_train_1, X_train_digits)
    X_train_vect = add_feature(X_train_2, X_train_non)

    X_test_1 = add_feature(X_test_vectorized, X_test_len)
    X_test_2 = add_feature(X_test_1, X_test_digits)
    X_test_vect = add_feature(X_test_2, X_test_non)

    # clf = SVC(C=100,kernel="linear").fit(X_train_vect,y_train)
    # clf = LogisticRegression(C=10000,solver='lbfgs').fit(X_train_vect,y_train)
    clf = MLPClassifier(solver='adam',
                        activation='relu',
                        hidden_layer_sizes=(600, 150),
                        learning_rate_init=3e-4,
                        batch_size=256,
                        random_state=0).fit(X_train_vect, y_train)
    y_predictions = clf.predict(X_test_vect)
    train_accuracy = clf.score(X_train_vect, y_train)
    test_accuracy = clf.score(X_test_vect, y_test)
    precision = precision_score(y_test, y_predictions)
    recall = recall_score(y_test, y_predictions)
    auc = roc_auc_score(y_test, y_predictions)
    confusion_Matrix = confusion_matrix(y_test, y_predictions)

    print("Train : ", train_accuracy)
    print("Test : ", test_accuracy)
    print("Precision : ", precision)
    print("Recall : ", recall)
    print("AUC : ", auc)
    print("Confusion Matrix : \n", confusion_Matrix)

    with open('./model.pkl', 'wb') as f:
        pickle.dump(clf, f)
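add_feature is not defined in this example; a minimal sketch of a helper that appends a dense column (such as message length) to the sparse TF-IDF matrix, assuming scipy is available (hypothetical implementation):

import numpy as np
from scipy.sparse import csr_matrix, hstack

def add_feature(X, feature_to_add):
    # Stack one extra feature column onto an existing sparse feature matrix.
    column = csr_matrix(np.asarray(feature_to_add)).T
    return hstack([X, column], format='csr')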
Example #3
def manage():
    arg = sys.argv[1]

    if arg == 'pre_process':
        pre_process()
    elif arg == 'cf':
        user_cf()
    elif arg == 'lfm':
        lfm()
    elif arg == 'personal_rank':
        personal_rank()
    else:
        raise ValueError('Arg must be one of ["pre_process", "cf", "lfm", "personal_rank"].')

    sys.exit(0)
Example #4
    def test_process_one_review__negative(self):
        original_classification = 0
        Y_true = []
        Y_pred = []
        ignored_non_english = [0]
        #TODO good study case for the word worship
        review = "I'm sorry, but calling Jesus the messiah is as kosher as eating a ham " \
                 "and cheese sandwich on Yom Kippur.  Bringing about world peace, " \
                 "having the world worship the same G-d, and the return of the Jewish diaspora" \
                 " back to the land of Israel were hardly accomplished by who the Christians" \
                 " believe to be the Messiah.  If he truly was Moshiach," \
                 " then none of these discussions would be taking place.<br /><br />" \
                 "When the Moshiach does come, it will end hatred and intolerance--" \
                 "unlike what happened when Jesus' followers tried to propagate their faith."
        pre_processed_review = pp.pre_process(text=review, nlp=nlp)

        bl.process_one_review_pp(
            original_classification=original_classification,
            pre_processed_review=pre_processed_review,
            ignored_non_english=ignored_non_english,
            Y_true=Y_true,
            Y_pred=Y_pred)
        expected = [0], [0]

        self.assertEqual(expected, (Y_true, Y_pred))
Example #5
def get_processed_data(train_data_path, test_data_path, is_to_save):
    """
    train data , test data 预处理
    :param train_data_path: 训练数据路径
    :param test_data_path: 测试数据路径
    :return: dataframe, 处理后数据
    """
    train_df = pre_process(train_data_path)
    test_df = pre_process(test_data_path)

    # save the pre-processed data
    if is_to_save:
        save_to_csv(train_df, save_path=config.train_seg_path)
        save_to_csv(test_df, save_path=config.test_seg_path)

    return train_df, test_df
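A hedged usage sketch; the attribute names on config below are hypothetical stand-ins for wherever the project keeps its raw data paths:

# Hypothetical config attributes for the raw CSV locations.
train_df, test_df = get_processed_data(config.train_data_path,
                                       config.test_data_path,
                                       is_to_save=True)
print(train_df.shape, test_df.shape)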
Example #6
def batch_preprocess(fundamental_frequency_in_blocks, voiced_samples, rms):
    """
    batch_preprocess(fundamental_frequency_in_blocks, voiced_samples, rms)

    This is the pre-process (pre-synthesis) stage. It computes the samples at the
    beginning of utterances and finally computes the selected_inflect_block.

    Parameters:
        fundamental_frequency_in_blocks: fundamental frequency (pitch) for each
            block of CHUNK_SIZE samples
        voiced_samples: the samples classified as voiced
        rms: the root-mean-square values of the signal
    Returns:
        selected_inflect_block: the blocks relevant to the synthesis process
    """

    voice_sample_begin = prep.utterance_region_begin_samples(voiced_samples)
    voice_chunk_sample = prep.utterance_chunk(
        voiced_samples, voice_sample_begin[1])
    inflection_voice_samples = prep.pre_process(voice_chunk_sample)
    #frequency_of_voiced_samples = fundamental_frequency_in_blocks[voiced_samples]
    #frequency_for_inflection = prep.potential_inflection_fundamental_frequency(frequency_of_voiced_samples)
    inflection_sample_numbers = prep.matrix_of_sample_numbers(
        rms[voice_sample_begin[0]], inflection_voice_samples)
    selected_inflect_block = prep.selected_inflect_block_new(
        inflection_sample_numbers)
    return selected_inflect_block
Example #7
def load_data(config: Config, transformer: TokensToNumbers, type):
    lines = read_data_file(config.get_data_path(type))
    tokens = pre_process(
        lines, config.baseline_config.embedding_config.pre_processing)
    return transformer.transform(
        config.baseline_config.embedding_config.input_type, tokens,
        config.max_sequence_length)
Example #8
def check_comarca_spelling(df_text, comarca_df, stopwords):
    '''Homogenize the comarca spelling data.
    Compare the comarca spelling from the dataframe with the one from the
    locations data (both preprocessed). If the dataframe value is found, the
    value from the locations data is returned. Useful to recognize comarcas
    without the correct accentuation, missing determinants, etc.'''
    comarca_df = comarca_df.unique()
    coms_prep  = [prep.pre_process(x,stopwords,sw=True) for x in comarca_df]
    loc_prep   = prep.pre_process(df_text, stopwords,sw=True)
    if loc_prep in coms_prep:
        ind = coms_prep.index(loc_prep)
        return comarca_df[ind]
    else:
        if df_text != '':
            print('\tComarca not found:',df_text)
        return '(NOTFOUND)'
Example #9
def classify_tweet(tweet, vectorizer, classifier):
    tweets = []
    tweet = preprocess.pre_process(tweet)
    tweets.append(tweet)
    tweets = convert_to_feature_dicts(tweets, False, 0)
    data = vectorizer.transform(tweets)
    result = classifier.predict(data)
    return result[0]
Example #10
    def predict(self, text):
        x = pre_process(text)
        x = torch.tensor(x, dtype=torch.long)
        pred = self.model(x).detach()
        pred = F.softmax(pred, dim=1).cpu().numpy()
        pred = pred.argmax(axis=1)
        pred = STARS[pred[0]]
        return STARS_SENTIMENT[pred]
Example #11
def read_training_set(filename):
    tweets = []
    labels = []
    with open(filename) as f:
        for line in f:
            tweet_dict = json.loads(line)
            tweets.append(preprocess.pre_process(tweet_dict['text']))
            labels.append(int(tweet_dict["label"]))
    return tweets, labels
Example #12
def classify():
    global tokenizer, model, classes
    question = request.args.get('question')
    question = pre_process(np.array([question]), c_sinonimo, metodo)
    question = bow_transform(question, tokenizer)
    #ret = model.predict(np.array([question]))
    ret = model.predict(question)
    # print(ret)
    return json.dumps(classes[int(ret[0])])
Example #13
def svm_train(sc, top_path, stopwords_dict=None):
    #   Hook for a custom stopword dictionary: if a new dictionary is provided, put it in this directory
    curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(
            read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(
            read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    #   For each text in the two class folders: tokenize, remove stopwords, and count word frequencies, e.g. {'pos': [counter, ...], 'neg': [counter, ...]}

    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need exactly two sub-folders")

    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name),
                                            stopwords)

    #   Select the words that best discriminate between the two classes as the feature set
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]],
                                 top_folder_dict[sub_folder[0]], topk)

    #   Compute the IDF over the two classes
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]],
              features)

    #   Vector representation of every text in each class under this binary split, e.g. [(), (), ...]
    vector1 = {
        '1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features),
                              IDF)
    }
    vector0 = {
        '0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features),
                              IDF)
    }

    #   Convert to the input format Spark needs, e.g. [LabeledPoint(0.0, [...]), ...]
    labpoint1 = [LabeledPoint(1.0, vec) for vec in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, vec) for vec in vector0['0.0']]
    train_data = labpoint1 + labpoint0

    classifier = SVMWithSGD.train(sc.parallelize(train_data))

    path = os.path.join(curpath,
                        'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path): os.remove(path)

    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
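A hedged usage sketch, assuming a local PySpark context and a hypothetical top-level folder that contains exactly one sub-folder of text files per class:

from pyspark import SparkContext

# "./reviews" is a hypothetical path holding two sub-folders, e.g. "pos" and "neg".
sc = SparkContext("local[*]", "svm_train_example")
svm_train(sc, "./reviews")
sc.stop()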
Example #14
def build_output_models(infile, outfile, models_to_run, run_on_sample,
                        grid_size):

    df = pre.pre_process(infile)
    # define grid to use: test, small, large
    clfs, grid = define_clfs_params(grid_size)
    df_sub = df.sample(frac=.25)

    # define models to run
    # models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']
    # models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']
    # Logistic Regression, K-Nearest Neighbor, Decision Trees, SVM, Random Forests, Boosting, and Bagging.
    # models_to_run=['KNN', 'RF', 'LR', 'DT', 'AB', 'SVM']
    # models_to_run=['DT', 'RF', 'AB', 'KNN', 'LR', 'SVM']
    # models_to_run=['RF']
    # models_to_run=['LR']

    # load data from csv
    # df = pd.read_csv("/Users/rayid/Projects/uchicago/Teaching/MLPP-2017/Homeworks/Assignment 2/credit-data.csv")

    # COME BACK HERE - select features to use
    features = [
        col for col in df if col not in [
            "projectid", "projectid", "teacher_acctid", "schoolid",
            "school_ncesid", "school_latitude", "school_longitude",
            "school_city", "school_state", "school_metro", "school_district",
            "school_county", "teacher_prefix", "primary_focus_subject",
            "primary_focus_area"
            "secondary_focus_subject", "secondary_focus_area", "resource_type",
            "poverty_level", "grade_level", "projectid", "teacher_acctid",
            "schoolid"
            "school_ncesid", "school_latitude", "school_longitude",
            "school_city", "school_state", "school_metro", "school_district",
            "school_county", "teacher_prefix", "primary_focus_subject",
            "primary_focus_area", "secondary_focus_subject",
            "secondary_focus_area", "resource_type", "poverty_level",
            "grade_level", "total_price_including_optional_support",
            "students_reached", "date_posted", "datefullyfunded", "dif",
            "less_60"
        ]
    ]

    if run_on_sample == 1:
        results_df = clf_loop(models_to_run, clfs, grid, df_sub, features,
                              "output/sample_mod_v2")
    else:
        results_df = clf_loop(models_to_run, clfs, grid, df, features,
                              "output/sample_mod_v2_")
    # save to csv
    results_df.to_csv(outfile, index=False)
Example #15
def load_data_for_conssed(config: Config, transformer: TokensToNumbers, type):
    lines = read_data_file(config.get_data_path(type))
    data = []
    for part_config in [
            config.conssed_config.semantic_part,
            config.conssed_config.sentiment_part
    ]:
        tokens = pre_process(lines,
                             part_config.embedding_config.pre_processing)
        data.append(
            transformer.transform(part_config.embedding_config.input_type,
                                  tokens, config.max_sequence_length))

    sem_data, sen_data = data
    return sem_data, sen_data
Example #16
    def get_data(self, show=False):
        p_split = int(np.floor(self.capacity * self.split_perc) - self.n_imgs)

        if self.train:
            index_choice = np.random.choice(self._indeces[:p_split], self.batch_size)
        else:
            index_choice = np.random.choice(self._indeces[p_split:], self.batch_size)

        for i, idx in enumerate(index_choice):
            self.img_stack[i][0:self.n_imgs] = np.array([pre_process(img) for img in self._img[idx:idx+self.n_imgs]])
            self.lab_stack[i] = self._labs[idx]
            if show:
                for j in range(self.n_imgs):
                    cv2.imshow("image", np.expand_dims(self._img[idx+j], 2))
                    cv2.waitKey(10)

        return self.img_stack, self.lab_stack
Example #17
    def test_process_one_review__testing(self):
        original_classification = 0
        Y_true = []
        Y_pred = []
        ignored_non_english = [0]
        # review = "CONTAINS SPOILERS!<br /><br />First off, when I have been a HUGE Sookie fan." \
        #          "  I loved these books and have read them over and over." \
        #          "  When I just picked up Deadlocked, I was like, what the heck!!!" \
        #          "  This isn't nearly long enough!<br /><br />" \
        #          "I felt like the book was filled with fluff and the plot line was thin." \
        #          "  So much time was spent on Sookie doing chores. " \
        #          " Perhaps the author wanted us to see her doing every day things?<br /><br" \
        #          " />Like other reviewers said, if you like Eric, you won't like this book. " \
        #          " I don't understand what happened to his character so fast. " \
        #          " The majority of the series was built of the Sookie/Eric tension and then they got to a great relationship. " \
        #          " I often found Sookie annoying, but Eric's character kept yanking me back with his humor and whit. " \
        #          " This book makes him look terrible." \
        #          "  I feel let down in that so many years were invested in getting to see him become a better person and love Sookie, " \
        #          "just to have them appearing to go their separate ways.<br /><br />There was too much Bill in this book. " \
        #          " We went books and books without Bill, and now he was in this book so much.<br /><br />" \
        #          "I felt from the very beginning Sookie would end up with Sam." \
        #          "  I really didn't want this to happen.  " \
        #          "Couldn't the girl just have a close guy friend?" \
        #          "  I was hoping she'd use the cluviel dor to make Eric human. " \
        #          " Perhaps he wouldn't have wanted it, but if she was being such a baby about it," \
        #          " why didn't she at least talk to him about it?<br /><br />Anyway," \
        #          " I felt the author was really trying to wrap things up in this book," \
        #          " getting ready for the last one next year." \
        #          "  I thought things were too rushed and I don't like where things are going." \
        #          "  The Fae left too quickly.. Anyways, this is my opinion," \
        #          " I hope the next book is longer (without the fluff) and leaves us satisfied.." \
        #          " but I'm thinking I'm going to be disappointed with that one as well."
        review = 'It is not great'
        pre_processed_review = pp.pre_process(text=review,
                                              nlp=nlp,
                                              as_sentence=False)

        bl.process_one_review_pp(
            original_classification=original_classification,
            pre_processed_review=pre_processed_review,
            ignored_non_english=ignored_non_english,
            Y_true=Y_true,
            Y_pred=Y_pred)
        expected = [0], [0]

        self.assertEqual(expected, (Y_true, Y_pred))
Example #18
def preproc_user_input(txt, model):
    """
    Applies the preprocessing steps to user input.
    
    Args:
        txt (string): a string representing user input
        model (Gensim trained model object): a trained Gensim Word2vec model
    
    Returns:
        string: a preprocessed user input string
        
    preproc_user_input applies the same preprocessing that was applied to the catalog. 
    However, it also applies additional preprocessing by removing words that are not
    in the model vocabulary.
    
    """
    txt = pre_process(txt)
    txt_tokenized = [word for word in txt.split(" ") if word in model.wv.vocab]
    return " ".join(txt_tokenized)
Example #19
    def test_process_one_review__negative_hard(self):
        original_classification = 0
        Y_true = []
        Y_pred = []
        ignored_non_english = [0]
        #TODO good study case for the word worship
        review = 'Very artistic quilts shown, no detailed instructions.  ' \
                 'More on the theory of making memory quilts.  ' \
                 'I was diappointed in the book. I had expected' \
                 ' moreinstructions on the actualy making the quilat'
        pre_processed_review = pp.pre_process(text=review, nlp=nlp)
        bl.process_one_review_pp(
            original_classification=original_classification,
            pre_processed_review=pre_processed_review,
            ignored_non_english=ignored_non_english,
            Y_true=Y_true,
            Y_pred=Y_pred)
        expected = [0], [1]

        self.assertEqual(expected, (Y_true, Y_pred))
Example #20
def svm_train(sc, top_path, stopwords_dict=None):
    #   Hook for a custom stopword dictionary: if a new dictionary is provided, put it in this directory
    curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    #   For each text in the two class folders: tokenize, remove stopwords, and count word frequencies, e.g. {'pos': [counter, ...], 'neg': [counter, ...]}

    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need exactly two sub-folders")

    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name), stopwords)

    #   Select the words that best discriminate between the two classes as the feature set
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], topk)

    #   Compute the IDF over the two classes
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], features)

    #   Vector representation of every text in each class under this binary split, e.g. [(), (), ...]
    vector1 = {'1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features), IDF)}
    vector0 = {'0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features), IDF)}

    #   Convert to the input format Spark needs, e.g. [LabeledPoint(0.0, [...]), ...]
    labpoint1 = [LabeledPoint(1.0, vec) for vec in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, vec) for vec in vector0['0.0']]
    train_data = labpoint1 + labpoint0

    classifier = SVMWithSGD.train(sc.parallelize(train_data))

    path = os.path.join(curpath, 'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path): os.remove(path)

    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
Example #21
File: lsa.py  Project: lmy86263/pLSA
def lsa(X, number_of_topics):
    reduced_u, reduced_sigma, reduced_v = reduce_dimension(X, number_of_topics)
    word_topic_matrix = np.dot(reduced_u, reduced_sigma)
    topic_doc_matrix = np.dot(reduced_sigma, reduced_v)

    app_X = np.dot(np.dot(reduced_u, reduced_sigma), reduced_v)
    return word_topic_matrix, topic_doc_matrix, app_X


if __name__ == '__main__':
    files = glob.glob('./text/*.txt')
    documents = []
    for f in files:
        documents.append(read_data(f))

    documents, words = pre_process(documents)
    X = word_doc_matrix(words, documents)

    word_topic_matrix, topic_doc_matrix, app_X = lsa(X, 5)
    print(word_topic_matrix)
    print(topic_doc_matrix)
    print(app_X)
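read_data, pre_process, word_doc_matrix and reduce_dimension are project helpers not shown here. A minimal sketch of what reduce_dimension might look like, assuming it truncates an SVD of the word-document matrix to the requested number of topics (hypothetical implementation, chosen to match the matrix shapes used in lsa above):

import numpy as np

def reduce_dimension(X, number_of_topics):
    # Truncated SVD: keep only the leading `number_of_topics` singular triplets.
    u, sigma, v = np.linalg.svd(X, full_matrices=False)
    reduced_u = u[:, :number_of_topics]
    reduced_sigma = np.diag(sigma[:number_of_topics])
    reduced_v = v[:number_of_topics, :]
    return reduced_u, reduced_sigma, reduced_v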
Example #22
def naive_bayes_algorithm(reviews1245, star1245, freq12, freq45, vocabulary, voc_common, prior_neg, prior_pos):
    
    true_negative = 0
    false_negative = 0
    true_positive = 0
    false_positive = 0
    else_indicator = 0
    
    false_positive_list = []
    false_negative_list = []
    else_list = []
    
    for ii, test_review in enumerate(reviews1245):

        target = test_review
        pre_processed = pp.pre_process(target, nlp)
        freq_target, most_common_target = frequency(pre_processed)
        
        prob_neg = 1
        prob_pos = 1
        
        words12_fullset = sum_freq(freq12)
        #print("total negative words:", words12_fullset)
        words45_fullset = sum_freq(freq45)
        #print("total positive words:", words45_fullset)
        
        for jj, word_pos in enumerate(freq_target):
            
            train_freq12 = freq12[word_pos]
            #print("negative frequency:", train_freq12)
            train_freq45 = freq45[word_pos]
            #print("positive frequency:", train_freq45)
            #current_freq = freq_target[word_pos]
            current_freq = 1 #COUNT WORDS ONLY ONCE
        
            
            #print("vocabulary:", len(vocabulary))
            if train_freq12 == 0 or train_freq45 == 0:
                current_prob_neg = 1
                current_prob_pos = 1
            else:
                current_prob_neg = (train_freq12 + current_freq) / (words12_fullset + len(vocabulary))
                current_prob_pos = (train_freq45 + current_freq) / (words45_fullset + len(vocabulary))

            #print("current_prob_neg:", current_prob_neg)
            prob_neg = prob_neg * current_prob_neg
            #print("prob_neg:", prob_neg)

            #print("current_prob_pos:", current_prob_pos)
            prob_pos = prob_pos * current_prob_pos
            #print("prob_pos:", prob_pos)
        
        neg = prior_neg*prob_neg
        pos = prior_pos*prob_pos
        
        #Print("Total negative score:", neg)
        #print("Total positive score:", pos)
        
        # ------------------ EVALUATION ----------------------
        if neg > pos and star1245[ii] in ('1', '2'):
            true_negative += 1
        elif neg > pos and star1245[ii] in ('4', '5'):
            false_negative += 1
            print("---- FALSE NEGATIVE ----")
            print(target)
            print('Star rating:', star1245[ii])
        elif neg < pos and star1245[ii] in ('1', '2'):
            false_positive += 1
            print("---- FALSE POSITIVE ----")
            print(target)
            print('Star rating:', star1245[ii])
        elif neg < pos and star1245[ii] in ('4', '5'):
            true_positive += 1

        
    return true_positive, true_negative, false_positive, false_negative, false_negative_list, false_positive_list, else_indicator, else_list
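The per-word likelihood above resembles add-one (Laplace) smoothing, except that words unseen in either class are skipped by assigning them probability 1. For reference, a minimal standalone sketch of standard add-one smoothing, assuming freq is a Counter of word counts for one class:

def smoothed_likelihood(word, freq, total_words, vocabulary_size):
    # Add-one (Laplace) smoothed estimate of P(word | class).
    return (freq[word] + 1) / (total_words + vocabulary_size)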
Example #23
def emotive_speech(x, fs, typeOfEmotion):
    CHUNK_SIZE = 1024
    NUM_BLOCKS = int(np.ceil(len(x) / CHUNK_SIZE))
    SAMPLE_PERIOD = 1 / float(fs) * CHUNK_SIZE
    TIME_STAMPS = (np.arange(0, NUM_BLOCKS - 1) * (CHUNK_SIZE / float(fs)))
    QFACTOR = 1
    #---------------------Analysis---------------------------------------#
    data_in_blocks = alysis.data_blocks(x, CHUNK_SIZE)
    fundamental_frequency_in_blocks = alysis.pitch_detect(x, fs, CHUNK_SIZE)
    voiced_unvoiced_starting_info_object = alysis.starting_info(
        x, fundamental_frequency_in_blocks, fs, CHUNK_SIZE)
    voiced_samples = voiced_unvoiced_starting_info_object['VSamp']
    voiced_regions = alysis.voiced_regions(
        x, fundamental_frequency_in_blocks,
        voiced_unvoiced_starting_info_object, CHUNK_SIZE)
    consecutive_blocks = 1 + int(0.5 / SAMPLE_PERIOD)

    #---------------------preprocess-------------------------------------#
    inflection_voice_samples = prep.pre_process(voiced_samples)
    frequency_of_voiced_samples = fundamental_frequency_in_blocks[
        inflection_voice_samples]
    rms = prep.root_mean_square(x, CHUNK_SIZE, fs)[0]
    frequency_for_inflection = prep.potential_inflection_fundamental_frequency(
        frequency_of_voiced_samples)
    inflection_sample_numbers = prep.matrix_of_sample_numbers(
        rms, inflection_voice_samples)
    inflect_blocks = prep.consecutive_blocks_for_inflection(
        inflection_sample_numbers, consecutive_blocks)
    selected_inflect_block = prep.alteration_of_discrete_data(
        inflection_sample_numbers, consecutive_blocks, inflect_blocks)
    n = prep.consecutive_blocks_in_selected_blocks(selected_inflect_block,
                                                   consecutive_blocks)
    reshaped_inflect_blocks = prep.reshaped_inflection_blocks(
        n, selected_inflect_block, consecutive_blocks)
    differece_arrays = prep.difference_arrays(NUM_BLOCKS,
                                              reshaped_inflect_blocks)

    #----------------------synthesis-------------------------------------#

    if typeOfEmotion == "Happy":
        consecutive_blocks = 1 + int(0.5 / SAMPLE_PERIOD)
        selected_inflect_block = prep.alteration_of_discrete_data(
            inflection_sample_numbers, consecutive_blocks, inflect_blocks)
        utterance_time_stamps = TIME_STAMPS[selected_inflect_block]

        gain = 3.0
        semitones = 0.5
        synth.happy_patch(fs, semitones, QFACTOR, gain, utterance_time_stamps)

    if typeOfEmotion == "HappyTensed":
        consecutive_blocks = int(0.5 / SAMPLE_PERIOD)
        inflection_sample_numbers = prep.matrix_of_sample_numbers(
            rms, inflection_voice_samples)
        inflect_blocks = prep.consecutive_blocks_for_inflection(
            inflection_sample_numbers, consecutive_blocks)
        selected_inflect_block = prep.alteration_of_discrete_data(
            inflection_sample_numbers, consecutive_blocks, inflect_blocks)
        utterance_time_stamps = TIME_STAMPS[selected_inflect_block]

        gain = 3.0
        semitones = 1.0
        synth.happy_tensed_patch(fs, semitones, QFACTOR, gain,
                                 utterance_time_stamps)

    if typeOfEmotion == "Sad":
        gain = 0.25
        semitones = -0.5
        synth.sad_patch(fs, semitones, QFACTOR, gain)

    if typeOfEmotion == "Afraid":
        speed = 8.5
        depth = 50
        utterance_time_stamps = TIME_STAMPS[selected_inflect_block]
        synth.afraid_patch(fs, speed, depth, utterance_time_stamps)
Example #24
    def predict(self, X_test, time_remain):
        timer = Timer()
        timer.set(time_remain)
        with timer.time_limit('ProProcess'):
            # fetch information of test dataset
            self.config[TEST_DATA_LENGTH] = len(X_test)
            self.config['test_time'] = self._fectch_time_range(X_test)
            self.config[STAGE] = 'test'

            Xs = self.tables
            main_table = pd.concat([Xs[MAIN_TABLE_NAME], X_test],
                                   axis=0,
                                   copy=False)
            main_table.reset_index(drop=True, inplace=True)

            del Xs[MAIN_TABLE_NAME]
            Xs[MAIN_TABLE_NAME] = main_table

            pre_process(Xs, self.config)
            clean_tables(Xs)
            pre_feature_extract(Xs)
            pre_tables_memory_cut(Xs)

            X = merge_table(Xs, self.config)
            # clean up intermediate data
            del self.tables, Xs
            gc.collect()

            self.null_count_sum(X, self.config)
            clean_df(X, fill_time=True)
            # compress data for memory problem
            X = table_memory_cut(X)

            # feature engineering
            print('overall X size', X.shape)
            X, add_feature = feature_engineer(X, self.config)

            # memory issue: ~11 GB
            X = table_memory_cut(X)
            add_feature = table_memory_cut(add_feature)
            X = pd.concat([X, add_feature], axis=1, copy=False)
            del add_feature
            print(X.shape)
            # re-compress data

            # split the test set back out of the combined table
            X_train_val, y_train_val = X.iloc[:self.config[TRAIN_DATA_LENGTH]], self.train_label
            X_test = X.iloc[self.config[TRAIN_DATA_LENGTH]:]

            train_len = int(self.config[TRAIN_DATA_LENGTH] * 0.8)
            valid_len = self.config[TRAIN_DATA_LENGTH] - train_len
            self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
            self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
            del X
            gc.collect()

            # feature processing
            all_label_count_feature_list = cat_Lable_Cnt_Fun(
                X_train_val, y_train_val, X_test, self.config)
            all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
                X_train_val, y_train_val, X_test, self.config)

            if (all_label_count_feature_list is
                    None) & (all_mutlicat_feature_data_list is None):
                X_train, y_train = X_train_val.iloc[:train_len], self.train_label[:train_len]
                X_val, y_val = X_train_val.iloc[train_len:], self.train_label[train_len:]
            else:
                all_feature_list = []
                if all_label_count_feature_list is not None:
                    all_feature_list += all_label_count_feature_list
                if all_mutlicat_feature_data_list is not None:
                    all_feature_list += all_mutlicat_feature_data_list

                add_feature_data = pd.concat(all_feature_list,
                                             axis=1,
                                             copy=False)
                add_feature_data.sort_index(inplace=True)

                del all_label_count_feature_list, all_mutlicat_feature_data_list, all_feature_list
                gc.collect()

                X_train = pd.concat(
                    [X_train_val[:train_len], add_feature_data[:train_len]],
                    axis=1,
                    copy=False)
                X_val = pd.concat([
                    X_train_val[train_len:self.config[TRAIN_DATA_LENGTH]],
                    add_feature_data[train_len:self.config[TRAIN_DATA_LENGTH]]
                ],
                                  axis=1,
                                  copy=False)
                y_train = self.train_label[:train_len]
                y_val = self.train_label[train_len:]

                X_test = pd.concat([
                    X_test, add_feature_data[self.config[TRAIN_DATA_LENGTH]:]
                ],
                                   axis=1,
                                   copy=False)

                del X_train_val, y_train_val, add_feature_data, self.train_label
                gc.collect()

        train_columns = train(X_train, X_val, y_train, y_val, self.config,
                              timer.remain)
        del X_train, X_val, y_train, y_val
        gc.collect()

        result = predict(X_test[train_columns], self.config)

        return pd.Series(result)
Example #25
import re

import numpy as np
from tsne import bh_sne
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import normalize
from matplotlib import pyplot as plt

from preprocess import data_load, pre_process

input = '../dat/data/18000*.json'
word2vec_model = '../trained_models/titles_wp_model_dim_300_maxn_6_minCount_5_minn_3_wordNgrams_3_ws_5.vec'

# load raw_data
raw_data = data_load(input)

# pre_process data
df, fts = pre_process(raw_data)

# pre-trained model from fasttext
model_ft = KeyedVectors.load_word2vec_format(word2vec_model)

raw_list = list(df.products.value_counts()[:80].index)
raw_list.extend(
    ['playstation', 'xbox', 'iphone', 'xiaomi', 'console', 'cat', 'dog'])
word_list = []
filter_list = []
for i in range(len(raw_list)):
    tokens = re.split(r' and |\s', raw_list[i])
    if all([token in model_ft.vocab for token in tokens]):
        word_list.append(tokens)
        filter_list.append(raw_list[i])
Example #26
def query(query, no_of_docs):
    query = expand_query(query)
    query = pre_process(query)

    # read persistent storage

    pickle_in = open('serial.txt', 'rb+')
    db = pickle.load(pickle_in)
    pickle_in.close()

    #get data that we stored in the map

    words = db['words']
    N = db['N']
    documents_vector = db['documents_vector']
    dfs = db['dfs']

    #generate query vector

    query_vector = []

    for word in words:
        # calculate the term frequency of this word in the query
        tf = 0
        for term in query:
            if term == word:
                tf = tf + 1

        df = 1 if dfs[word] == 0 else dfs[word]

        idf = math.log(N / df)
        tfidf = tf * idf
        query_vector.append(tfidf)

    # score documents by dot product with the query vector (unnormalized cosine similarity)

    scores = []
    for vector in documents_vector:
        score = np.dot(vector, query_vector)
        scores.append(score)

    document_scores = []

    dir = PROBS_DIRS
    file_dir = os.listdir(dir)
    file_dir.sort()

    for x in range(len(file_dir)):
        document_scores.append((scores[x], file_dir[x]))

    document_scores.sort(reverse=True)

    #return no_of_docs documents
    result = []

    if len(document_scores) < no_of_docs:
        for x in range(len(document_scores)):
            if document_scores[x][0] != 0:
                result.append(document_scores[x][1])
    else:
        for x in range(no_of_docs):
            if document_scores[x][0] != 0:
                result.append(document_scores[x][1])

    print(result)
    return result
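The score above is a raw dot product between unnormalized vectors. If true cosine similarity is wanted, a hedged sketch of the normalized version:

import numpy as np

def cosine_similarity(a, b):
    # Cosine similarity between two vectors; returns 0.0 for zero-length vectors.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0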
Example #27
File: rex.py  Project: zhz46/system2
# assign weights; these could be passed in via the command line (argv) later
model_pars = {'weight': {'title_wt': 0.85,
                         'prod_wt': 0,
                         'image_wt': 0.15,
                         'brand_wt':0,
                         'price_wt':0},
              'method': 'tfidf_word2vec',
              'title_dim': 300}


# load raw_data
raw_data = data_load(text_input)

# pre_process data
df = pre_process(raw_data)

# load model
if model_pars['method'] in ['mean_word2vec', 'tfidf_word2vec']:
    model_path = word2vec_model
    model_ft = KeyedVectors.load_word2vec_format(model_path)
    # filter out empty bags of word
    df = df_filter(df, model_ft)
if model_pars['method'] == 'dm':
    model_path = doc2vec_model
    model_ft = Doc2Vec.load(model_path)

# inner join text and image data
if model_pars['weight']['image_wt'] != 0:
    df = image_merge(df, image_input)
Example #28
def main():

    arg: dict = process_arg()

    # --------------
    # read dataset
    # --------------
    df: pandas.DataFrame = pandas.read_csv(arg['train_path'], header=None)
    # df = preprocess.pre_process(df)

    # train set,
    num_ex = arg['train_percent'] * len(df) / 100
    num_ex = int(num_ex)
    df_train = df[:num_ex]
    df_train = preprocess.pre_process(df_train)
    print('train shape:', df_train.shape)

    # validation set
    num_ex = arg['valid_percent'] * len(df) / 100
    num_ex = int(num_ex)
    df_valid = df[-num_ex:]
    df_valid = preprocess.pre_process(df_valid)
    print('valid shape:', df_valid.shape)

    # test set
    df_test: pandas.DataFrame = pandas.read_csv(arg['test_path'], header=None)
    df_test = preprocess.pre_process(df_test)
    print('test shape:', df_test.shape)

    # --------------
    # build tree
    # --------------

    decision_tree = DecisionTree(train_set=df_train,
                                 valid_set=df_valid,
                                 test_set=df_test)

    print('build tree option:', arg['option'])
    if arg['option'] != 'maxDepth':
        decision_tree.build_normal()
    else:
        decision_tree.build_max_depth(arg['max_depth'])
        valid_accur = decision_tree.validate()
        print("validation set accuracy: %.4f" % valid_accur)

    # prune tree
    n_nodes = get_num_nodes(decision_tree.root)
    print("Number of nodes before prune:", n_nodes)
    h = get_height(decision_tree.root)
    print("height before prune:", h)

    if arg['option'] == 'prune':
        decision_tree.prune_tree()

    # --------------
    # testing
    # --------------

    train_correct = 0
    for index, row in df_train.iterrows():
        output = decision_tree.predict(row=row)
        if output == row[index_col]:
            train_correct += 1

    test_correct = 0
    for index, row in df_test.iterrows():
        output = decision_tree.predict(row=row)
        if output == row[index_col]:
            test_correct += 1

    train_accuracy = train_correct / len(df_train)
    test_accuracy = test_correct / len(df_test)
    print("Train set accuracy: %.4f" % train_accuracy)
    print("Test set accuracy: %.4f" % test_accuracy)
Example #29
################################################################################
                              #SCRIPT
################################################################################
raw_data = rc.read_dataset(infile)
results_df =  pd.DataFrame(columns=('model_type', 'validation_date', 'clf', 'parameters', 'auc-roc', \
                                    'p_at_1', 'p_at_2', 'p_at_5', 'p_at_10', 'p_at_20', 'p_at_30', 'p_at_50', \
                                    'r_at_1', 'r_at_2', "r_at_5", "r_at_10", 'r_at_20', 'r_at_30', 'r_at_50', \
                                    'f1_at_1', 'f1_at_2', "f1_at_5", "f1_at_10", 'f1_at_20', 'f1_at_30', 'f1_at_50',
                                     'baseline', 'len_x_train'))

clfs, grid = ml.define_clfs_params(grid_size)

for validation_date in validation_dates:
    train_set, validation_set = ml.temporal_split(raw_data, temporal_split_date_var, validation_date, 6, 60)

    #preprocess the train_set and test_set separately
    train_set = pre.pre_process(train_set, dummy_vars, boolean_vars, vars_not_to_include, columns_to_datetime)
    validation_set = pre.pre_process(validation_set, dummy_vars, boolean_vars, vars_not_to_include, columns_to_datetime)

    #create features - there will be features in the train that don't exist in test and vice versa
    #the model will only actually use the intersection of the two.
    train_features  = list(train_set.columns)
    test_features = list(validation_set.columns)

    #find the intersection of the two lists
    intersection_features = list(set(train_features) & set(test_features))
    intersection_features.remove(prediction_var)

    #run the loop and save the output df
    results_df = ml.clf_loop(train_set, validation_set, intersection_features, prediction_var, models_to_run, clfs, grid, results_df, validation_date, "output/intermediate.csv")
Example #30
#    1. ignore words that appear in 85% of documents,
#    2. eliminate stop words
#    3. limit vocabulary
cv = CountVectorizer(max_df=0.85, stop_words=stopwords, max_features=10000)
word_count_vector = cv.fit_transform(docs)

# print("Vocabulary: ", list(cv.vocabulary_.keys())[:50])
# print(list(cv.get_feature_names())[2000:2015])

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# get input data
data_in = pd.read_json("../dist/hn.json", lines=True)
data_in = preprocess.combine_text(data_in)
data_in['text'] = data_in['text'].apply(lambda x: preprocess.pre_process(x))

stories = data_in['text'].tolist()
feature_names = cv.get_feature_names()

results = []
for story in stories:
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([story]))

    # sort the tf-idf vectors by descending order of scores
    sorted_items = sort(tf_idf_vector.tocoo())

    # extract only the top n; n here is 10
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
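sort and extract_topn_from_vector are helpers not shown on this page; a minimal sketch of what they might look like, assuming sort orders the COO entries by descending score and extract_topn_from_vector maps the top entries back to feature names (hypothetical implementations):

def sort(coo_matrix):
    # Sort (column_index, tf-idf score) pairs by descending score.
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    # Map the top-n (column_index, score) pairs back to {term: rounded score}.
    return {feature_names[idx]: round(score, 3) for idx, score in sorted_items[:topn]}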
Example #31
def single_review(test_review, star_rating, prior_neg, prior_pos):
      
    true_negative = 0
    false_negative = 0
    true_positive = 0
    false_positive = 0
    
    false_positive_list = []
    false_negative_list = []
    
    target = test_review
    pre_processed = pp.pre_process(target, nlp)
    freq_target, most_common_target = frequency(pre_processed)
        
    prob_neg = 1
    prob_pos = 1
        
    words12_fullset = sum_freq(freq12)
    #print("total negative words:", words12_fullset)
    words45_fullset = sum_freq(freq45)
    #print("total positive words:", words45_fullset)
        
    for jj, word_pos in enumerate(freq_target):
        
        print("------ New Target -------")
        print(word_pos)
        
        train_freq12 = freq12[word_pos]
        #print("negative frequency:", train_freq12)
        train_freq45 = freq45[word_pos]
        #print("positive frequency:", train_freq45)
        #current_freq = freq_target[word_pos]
        current_freq = 1 #COUNT WORDS ONLY ONCE
    
        
        #print("vocabulary:", len(vocabulary))
        
        
        if train_freq12 == 0 or train_freq45 == 0:
            current_prob_neg = 1
            current_prob_pos = 1
        else:
            current_prob_neg = (train_freq12 + current_freq) / (words12_fullset + len(vocabulary))
            current_prob_pos = (train_freq45 + current_freq) / (words45_fullset + len(vocabulary))

        print("current_prob_neg:", current_prob_neg)
        prob_neg = prob_neg * current_prob_neg
        #print("prob_neg:", prob_neg)

        print("current_prob_pos:", current_prob_pos)
        prob_pos = prob_pos * current_prob_pos
        #print("prob_pos:", prob_pos)

    neg = prior_neg*prob_neg
    pos = prior_pos*prob_pos
    
    print("negative_score:", neg)
    print("positive_score:", pos)
    
    if neg > pos and star_rating in ('1', '2'):
        true_negative += 1
    elif neg > pos and star_rating in ('4', '5'):
        false_negative += 1
        print("---- FALSE NEGATIVE ----")
        false_negative_list.append(target)
    elif neg < pos and star_rating in ('1', '2'):
        false_positive += 1
        print("---- FALSE POSITIVE ----")
        false_positive_list.append(target)
    elif neg < pos and star_rating in ('4', '5'):
        true_positive += 1
            
    return true_positive, true_negative, false_positive, false_negative