Example #1
def CNN_VA_prediction(text):
    def is_English(text=None):
        if text is not None:
            try:
                text.encode('ascii')
            except UnicodeEncodeError:
                # print("it was not a ascii-encoded unicode string")
                return False
            else:
                # print("It may have been an ascii-encoded unicode string")
                return True
        else:
            # print('The input string is None.')
            return

    if is_English(text):
        return cnn(text)
    else:
        return cnn_Chinese(text)
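    # Caveat: the ASCII-encodability check above routes any text containing non-ASCII
    # characters (accented Latin letters, emoji, etc.) to the Chinese model as well.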
if __name__ == "__main__":
    from load_data import load_corpus

    data = load_corpus('../data/corpus/cn/corpus_raw/')
    for i in data:
        print(''.join(i))
        print(CNN_VA_prediction(''.join(i)))
    exit()
    text = '我今天特別高興'
    print(CNN_VA_prediction(text))

    text = 'appy B-day Jim Price!! :-) (you are more awesome than you could dream) Hope today was the best ever!!  :-D'
    print(CNN_VA_prediction(text))
Example #2
import numpy as np
from collections import defaultdict


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab
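
# For example, get_vocab([['我', '今天'], ['今天', '高興']]) prints 3 and returns
# the counts {'我': 1.0, '今天': 2.0, '高興': 1.0} as a defaultdict.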

# Note: this file builds the CNN input data for the CVAT corpus
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
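
# build_embedding_matrix itself is not shown in this example. The sketch below is only
# an assumption about the shape of its output (a word -> row-index map plus a matrix
# whose rows hold the k-dimensional word vectors), not the repo's actual implementation.
def build_embedding_matrix_sketch(embeddings, vocab, k=400):
    word_idx_map = {}
    rows = [np.zeros(k)]  # row 0 reserved for padding / unknown words
    for idx, word in enumerate(vocab, start=1):
        word_idx_map[word] = idx
        vec = embeddings.get(word)  # assumes a dict-like word -> vector lookup
        rows.append(vec if vec is not None else np.random.uniform(-0.25, 0.25, k))
    return np.asarray(rows), word_idx_map
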
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
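    # Note: `cross_validation` here is the old scikit-learn module; in current
    # scikit-learn the same train_test_split lives in sklearn.model_selection.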
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    lexicon = combine_lexicon(lexicon_name, expand_name)

    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')
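
# The evaluate_* helpers are not shown in this example. As a rough sketch (an assumption,
# not the repo's code), the 'mean' variant scores each document by the plain average of
# the lexicon values of its words, and that score is then compared against `mark`:
def mean_score_sketch(doc_words, lexicon):
    vals = [lexicon[w] for w in doc_words if w in lexicon]
    return sum(vals) / len(vals) if vals else None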
Example #5
# This was used to build data for the model from the YouTube 2016 paper; I ran it once as well, but the results were terrible...
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from functiontool.getrelation import getrelation
from functiontool.dirkit import getdir
from functiontool.baseinfo import getlive
from sklearn import metrics
import math
from load_data import load_corpus, index_item, get_live_vec

sentences, words, validatewords, user = load_corpus('corpus.txt',
                                                    usertxt='./sample/dir.txt')
assert len(sentences) == len(user)
index2item, item2index, vocabulary = index_item(words)
user_num = len(sentences)
item_num = len(words)
vocabulary_size = len(vocabulary)
print(vocabulary_size)
# sentences : sequences whose elements are raw live IDs
# words: list of lives sorted by popularity, carrying word frequencies, so each entry is a tuple
# validatewords: the last few items are held out and used as the validation set
# user: list of user IDs aligned index-by-index with sentences
# vocabulary: list sorted by word frequency

# All index information has been obtained; the dataset indices are now loaded, but not yet split
print('All items registered; loading the data next')
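
# index_item itself is not shown here; a minimal sketch of what it is assumed to return,
# given the popularity-sorted (item, frequency) tuples in `words`:
def index_item_sketch(words):
    index2item = {i: item for i, (item, _freq) in enumerate(words)}
    item2index = {item: i for i, item in index2item.items()}
    vocabulary = [item for item, _freq in words]
    return index2item, item2index, vocabulary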

# Next, load the basic info
Example #6
    # plot hourly distribution for different parts of the week
    hist_comb = hist_hourly_weekday_weekend(df)
    plot_custom(hist_comb,
                'line',
                title='Hourly Submissions Over the Weekdays',
                file='hist_hourly_weekday_weekend',
                xlabel='Hour',
                ylabel='Count',
                colors=['tab:red', 'tab:green'],
                show=False)

    # word corpus analysis
    df_collection = list()

    # individual corpus analysis plot
    stress_words = load_corpus('data/stress_corpus.txt')
    df_result = keywords_over_quarters(
        stress_words,
        calendar,
        df,
        title='Stress & Depression over typical Quarter',
        file='plot_stress',
        colors=['lightgrey', 'tab:red', 'tab:green'],
        show=False)
    df_collection.append(('Stress/Depression', df_result))

    thirst_words = load_corpus('data/thirst_corpus.txt')
    df_result = keywords_over_quarters(
        thirst_words,
        calendar,
        df,
Example #7
    def __init__(self, file_dir):
        self.file_dir = file_dir
        self.corpus = load_corpus(file_dir)
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="ordinary_least_squares")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Ridge_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Bayesian_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="SVR")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="KNN_Reg")


if __name__ == "__main__":
    normalize = True
    corpus = load_corpus(get_file_path("cn_corpus"))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path("mark"))
    lexicon = combine_lexicon(get_file_path("lexicon"), get_file_path("neural_cand"))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print("start.....")
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print("OK")
# Standardize the basic info, then write it to csv

from load_data import load_corpus, index_item
import pandas as pd
import numpy as np
from functiontool.baseinfo import getlive

sentences, words, validatewords, user = load_corpus('corpus_all.txt', usertxt='dir.txt')
assert len(sentences) == len(user)
index2item, item2index, vocabulary = index_item(words)
vocabulary_size = len(vocabulary)
future_batch = 5
print(vocabulary_size)
print(user[:30])
# sentences : sequences whose elements are raw live IDs
# words: list of lives sorted by popularity, carrying word frequencies, so each entry is a tuple
# validatewords: the last few items are held out and used as the validation set
# user: list of user IDs aligned index-by-index with sentences
# vocabulary: list sorted by word frequency

# All index information has been obtained; the dataset indices are now loaded, but not yet split
print('All items registered; loading the data next')

# Next, load the basic info

def get_user_base(user):
    from sqlalchemy import create_engine
    base_engine = create_engine("mysql+pymysql://root:[email protected]:3306/zhihu", max_overflow=5)
    print('Start reading the 400k users')
    df = pd.read_sql('user', base_engine)  # read the whole `user` table into a DataFrame
    df.set_index(["id"], inplace=True)
    print('Done reading')
Example #10
    def __init__(self, file_dir):
        self.file_dir = file_dir
        self.corpus = load_corpus(file_dir)
Example #11
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    lexicon = combine_lexicon(lexicon_name, expand_name)

    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')