class PtBrTwitter:
    def __init__(self, dir_in, dir_out):
        self.dir_in = dir_in
        self.dir_out = dir_out
        self.tw_files = ([file for root, dirs, files in os.walk(self.dir_in)
                          for file in files if file.endswith('.json')])
        self.doc_list = list()
        self.date_list = list()
        self.tp = TextProcessor()

    def read(self):
        for tw_file in self.tw_files:
            with open(self.dir_in + tw_file) as data_file:
                for line in data_file:
                    tweet = json.loads(line)
                    self.doc_list.append(tweet['text'])
                    self.date_list.append(tweet['created_at'])

    def tokenizeAndSave(self, file_name):
        tweets = self.tp.text_process(self.doc_list)
        tweets = list(itertools.chain.from_iterable(tweets))
        t_count = Counter(tweets)
        with open(self.dir_out + file_name, 'wb') as handle:
            pickle.dump(t_count, handle)

    def loadCounter(self, file_name):
        with open(self.dir_out + file_name, 'rb') as handle:
            t_count = pickle.load(handle)
        return t_count
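# Hedged usage sketch for PtBrTwitter (not part of the original source): the directory paths
# and pickle file name below are hypothetical, and it assumes TextProcessor plus the
# os/json/itertools/pickle/Counter imports used by the class are available in this module.
if __name__ == '__main__':
    pbt = PtBrTwitter("/path/to/tweets_json/", "/path/to/output/")
    pbt.read()                                # load 'text' and 'created_at' from every *.json file
    pbt.tokenizeAndSave("token_counter.pck")  # tokenize and pickle a Counter of all tokens
    t_count = pbt.loadCounter("token_counter.pck")
    print(t_count.most_common(10))            # most frequent tokens in the collection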
file = "/Users/lucasso 1/Documents/validation/nao_politicos.json" data = { 'favorites': [], 'user_id': [], 'text': [], 'retweets': [], 'created_at': [], 'tweet_id': [], 'user_screen_name': [] } for t in load_tweets(file): data['user_id'].append(t['user_id']) data['favorites'].append(t['favorites']) data['text'].append(t['text']) data['retweets'].append(t['retweets']) data['created_at'].append(t['created_at']) data['tweet_id'].append(t['tweet_id']) data['user_screen_name'].append(t['user_screen_name']) df = pd.DataFrame(data) df['created_at'] = pd.to_datetime(df['created_at'], unit='ms') df = df.set_index('created_at') df = df.sort_index(ascending=True) tp = TextProcessor() df['text_processed'] = tp.text_process(df.text.tolist(), hashtags=True) df['political'] = 0 file = file.replace('json', 'pck') df.to_pickle(file)
tp = TextProcessor()
id_rep, names = rt.names_from_xls()
for idx in range(len(names)):
    tweets = list()
    graphs = list()
    tw = nx.Graph()
    data = rt.tweets_election_data(id_rep[idx])
    diction = {k: v for (k, v) in data.items()}
    for i in diction:
        tweets.append(list(itertools.chain.from_iterable(tp.text_process(diction[i].split()))))
    # build a word co-occurrence graph: every pair of words in the same tweet adds 1 to the edge weight
    for tweet in tweets:
        for u, v in itertools.combinations(tweet, 2):
            if tw.has_edge(u, v):
                tw[u][v]['weight'] += 1
            else:
                tw.add_edge(u, v, weight=1)
    nx.write_gml(tw, dir_out + names[idx] + ".gml")

files = list()
for file in os.listdir(dir_out):
    if file.endswith(".gml"):
        files.append(file)
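# Hedged follow-up sketch (not in the original script): read one of the .gml co-occurrence
# graphs back and list its heaviest edges, i.e. the word pairs most often used together.
# The file name is hypothetical; networkx is assumed to be imported as nx.
g = nx.read_gml(dir_out + "example_parliamentarian.gml")
top_pairs = sorted(g.edges(data='weight'), key=lambda e: e[2], reverse=True)[:10]
for u, v, w in top_pairs:
    print("%s -- %s: co-occurrence weight %d" % (u, v, w))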
with open(dir_out + "random-pck/coleta1.pck", 'rb') as handle:
    random_pck.append(pickle.load(handle))
with open(dir_out + "random-pck/coleta2.pck", 'rb') as handle:
    random_pck.append(pickle.load(handle))

month_files = list()
for m in range(10):
    month_files.append(load_files(dir_out + "tw_month/month_" + str(m) + "/"))

tp = TextProcessor()
month_processed = list()
for tw in month_files:
    tmp = list()
    for dep in tw:
        tmp.append(tp.text_process(dep, text_only=True))
    month_processed.append(tmp)

ranked_month = list()
for i, month in enumerate(month_processed):
    tmp = tfidf_month(month, random_pck)
    ranked_month.append(tmp)
    save_pck(dir_out + "tw_month/month_" + str(i) + "/", tmp)

tfidf = TfIdf()
# compute the tf-idf for each folder against the random sample, according to the size of each month
if excel:
    sheet_name = "nao_eleitos"
    col = 4
    rep_dic = {}
    for fname in tw_files:
        rep_dic[fname.split('_', 1)[0]] = fname
    xls = xlrd.open_workbook(dir_xls)
    sheet = xls.sheet_by_name(sheet_name)
    for i in range(sheet.nrows):
        id_rep = str(int(sheet.cell_value(rowx=i, colx=col)))
        if id_rep in rep_dic:
            with open(dir_in + rep_dic[id_rep]) as data_file:
                print('file %s' % data_file)
                for line in data_file:
                    tweet = json.loads(line)
                    tweet['text_processed'] = ' '.join(tp.text_process([tweet['text']], text_only=True)[0])
                    tweet['cond_55'] = sheet_name
                    db.tweets.insert(tweet)
else:
    for tw_file in tw_files:
        with open(dir_in + tw_file) as data_file:
            print('file %s' % data_file)
            for line in data_file:
                tweet = json.loads(line)
                tweet['text_processed'] = ' '.join(tp.text_process([tweet['text']], text_only=True)[0])
                tweet['cond_55'] = 'nao_eleitos'
                db.tweets.insert(tweet)
f = open(dir_down + "sanders-twitter-0.2/full-corpus.csv", "rt")
twitter = csv.reader(f, delimiter=',')
tweets = list()
for tw in twitter:
    tweets.append(tw)
f.close()
random.shuffle(tweets)

topic = list()
txt = list()
for tw in tweets:
    topic.append(tw[0])
    txt.append(tw[4])

txt = tp.text_process(txt, lang="english")
with open(dir_out + "sanders_twitter.pck", 'wb') as handle:
    pickle.dump(txt, handle)

f = open(dir_out + "sanders_twitter.txt", 'w')
for l in txt:
    f.write(" ".join(l) + "\n")
f.close()

f = open(dir_out + "topic_sanders_twitter.txt", 'w')
for t in topic:
    f.write(t + "\n")
f.close()
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_rob = path['dir_rob']

    doc_list, parl_tw_list = load_files(dir_rob)
    tp = TextProcessor()
    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))

    with open(dir_in + "coleta1.pck", 'rb') as handle:
        coleta1 = pickle.load(handle)
    with open(dir_in + "coleta2.pck", 'rb') as handle:
        coleta2 = pickle.load(handle)

    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed))))
    tot_counter = Counter(tweets)
    parl_counters = list()
    for parl in parl_tw_processed:
        tw = list(itertools.chain.from_iterable(parl))
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")
tp = TextProcessor()
texts = list()
tx_class = list()

tmp = list()
with open(POLITICS_FILE) as l_file:
    for line in l_file:
        tmp.append(line)
        tx_class.append('politics')
texts += tp.text_process(tmp, text_only=True)

tmp = list()
with open(NON_POLITICS_FILE) as l_file:
    for line in l_file:
        tmp.append(line)
        tx_class.append('non-politics')
texts += tp.text_process(tmp, text_only=True)

texts = select_texts(texts)
vocab = gen_vocab(word2vec_model)
X, y = gen_sequence(vocab, texts, tx_class)
col = 4
rt = ReadTwitter(dir_in, excel_path, sheet_name, col)
tp = TextProcessor()
id_rep, names = rt.names_from_xls()
parl_words = Counter()
counter_list = list()
tw_apriori = list()
for idx in range(len(names)):
    tweets = list()
    data = rt.tweets_election_data(id_rep[idx])
    for k, v in data.items():
        tweets.append(tp.text_process(v.split()))
    tw_apriori += [[x[0] for x in e if x] for e in tweets if e]
    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(tweets))))
    counter_list.append(Counter(tweets))
    parl_words.update(tweets)

# entropy of each word across the per-parliamentarian counters
word_ent = dict()
for word, count in parl_words.items():
    ent = 0
    for counter in counter_list:
        prob = counter[word] / count
        ent += prob * (-np.log2(prob + 1e-100))
    word_ent[word] = ent
sort = sorted(word_ent.items(), key=lambda x: x[1], reverse=True)
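# Illustrative sketch (not part of the pipeline above): the entropy loop treats a word's counts
# across the per-parliamentarian counters as a distribution, so a word used evenly by everyone
# scores high and a word concentrated in one account scores near zero. The toy counters below
# are made-up data, used only to show the formula.
import numpy as np
from collections import Counter

toy_counters = [Counter({'saude': 3, 'futebol': 1}),
                Counter({'saude': 3}),
                Counter({'saude': 2, 'futebol': 5})]
totals = Counter()
for c in toy_counters:
    totals.update(c)

for word, count in totals.items():
    ent = 0.0
    for c in toy_counters:
        prob = c[word] / count
        ent += prob * (-np.log2(prob + 1e-100))
    print(word, round(ent, 3))  # 'saude' is spread over all three counters, 'futebol' over two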
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list, text_only=True)
    tw_words = add_separator(tweets)
    parl_bigrams = get_bigrams(tw_words, 3, True)

    # process each congressperson's tweets
    parl_processed = list()
    parl_tri_processed = list()
    for l in parl_tw_list:
        temp = add_separator(tp.text_process(l, text_only=True))
        parl_tri_processed.append(get_trigrams(temp, 3, True))
        parl_processed.append(get_bigrams(temp, 3, True))

    with open(dir_out + "list_dept_bigrams_.pck", 'wb') as handle:
        pickle.dump(parl_processed, handle)
    with open(dir_out + "list_dept_trigrams_.pck", 'wb') as handle:
        pickle.dump(parl_tri_processed, handle)
    with open(dir_in + tw_file) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            doc_list.append(tweet['text'])
    return doc_list


if __name__ == '__main__':
    dir_in = "/Users/lucasso/Documents/pck/"
    dir_ent = "/Users/lucasso/Documents/tweets_pedro/"
    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/"

    doc_list = load_files(dir_ent)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list)
    # word list already processed, with entropy-0 and ratio > 1 words excluded;
    # drop every other word of no interest from the tweets
    word_list = set(load_file(dir_out, "word_list.pck"))
    tweets = [[i for i in t if i in word_list] for t in tweets]

    hashtags = re.compile(r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""")
    hs_set = set()
    hastgs_list = list()
    for tweet in tweets:
        v = ','.join(hashtags.findall(' '.join(tweet)))
        l = hashtags.findall(' '.join(tweet))
        hastgs_list.append(l)
        hs_set |= set(v.split(","))
    hastgs_list = [e for e in hastgs_list if e]  # remove the empty lists
cf = configparser.ConfigParser()
cf.read("../file_path.properties")
path = dict(cf.items("file_path"))
dir_out = path['dir_out']
dir_ale = path['dir_ale']
dir_tw = path['dir_tw']

print("load tweet files")
fnames = ([file for root, dirs, files in os.walk(dir_tw)
           for file in files if file.endswith('.json')])
categories_tw = list()
tp = TextProcessor()
for fl in fnames:
    categories_tw.append(tp.text_process(read_tweets(fl)))

# hold out 20% of each category as test data and build a Counter over the remaining 80%
categories_counter = list()
test_data = list()
for categ in categories_tw:
    k = int(len(categ) * 0.2)
    random.shuffle(categ)
    tmp = list(itertools.chain.from_iterable(categ[k:]))
    categories_counter.append(Counter(tmp))
    test_data.append(categ[:k])

print("process tfidf")
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
plot_cloud(sort_tfidf_like, n, "dic_tfidf_like")

dir_path = "/Users/lucasso/Documents/tweets/"
tp = TextProcessor()
tw_files = ([file for root, dirs, files in os.walk(dir_path)
             for file in files if file.endswith('.json')])
tw_list = list()
tweets = list()
for tw_file in tw_files:
    with open(dir_path + tw_file) as data_file:
        doc_list = list()
        for line in data_file:
            tweet = json.loads(line)
            doc_list.append(tweet['text'])
        tw_list.append(list(itertools.chain.from_iterable(tp.text_process(doc_list))))

for i in range(len(tw_list)):
    plot_dep_cloud(tw_list[i], sort_tfidf, n, tw_files[i] + "_dic_tfidf")
    plot_dep_cloud(tw_list[i], sort_tf_log_idf, n, tw_files[i] + "dep_dic_tf_log_idf")
    plot_dep_cloud(tw_list[i], sort_tfidf_like, n, tw_files[i] + "dic_tfidf_like")

# generate the CSV with the rankings
save_ranking(dir_out, sort_tfidf, sort_tf_log_idf, sort_tfidf_like)
# generate the tfidf_like table and its parameters
save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_dep)
# Linux command to swap . for ,: tr '.' ',' < arquivo_in > arquivo_out
# generate the ranking of words for each member of parliament
dic_political = dict(dic_tfidf_like)
VALIDATION_FILE = args.validationfile
cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']

X, y_true = load_validation_file_csv(VALIDATION_FILE)
tp = TextProcessor()
pc = PoliticalClassification(H5_FILE, NPY_FILE, 25)

pol = ''
n_pol = ''
y_pred = list()
X = tp.text_process(X, text_only=True)
for tx in X:
    text = ' '.join(tx)
    if pc.is_political(text):
        pol += text + '\n'
        y_pred.append(1)
    else:
        n_pol += text + '\n'
        y_pred.append(0)

print(classification_report(y_true, y_pred))
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
ff1 = f1_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
dir_w2v = path['dir_w2v']

print('Loading word2vec model...')
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

texts, y_true = load_validation_file_csv(VALIDATION_FILE)
print('Loading ' + MODEL_FILE + ' file...')
model = joblib.load(MODEL_FILE)

pol = ''
n_pol = ''
y_pred = list()
tp = TextProcessor()
texts = tp.text_process(texts, text_only=True)
X = gen_data(texts)

mean_auc, std_auc = generate_roc_curve(
    model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

print('Predicting...')
y_pred = model.predict(X)
print('Classification Report')
print(classification_report(y_true, y_pred))
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

model_name = MODEL_FILE.replace(SKL_FOLDER, '')
model_name = model_name.replace('.politics_ben.skl', '')
    with open(dir_in + tw_file) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            temp.append(tweet['text'])
        doc_list.append(temp)
    return doc_list, tw_files


if __name__ == "__main__":
    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_val']

    tp = TextProcessor()
    tweets = list()
    doc_list, tw_files = load_files(dir_in)
    for txt in doc_list:
        print(len(doc_list))
        tweets.append(tp.text_process(txt, text_only=True))

    for i, fl in enumerate(tw_files):
        f = open(dir_in + "%s.txt" % fl.split('.')[0], 'w')
        for tw in tweets[i]:
            f.write(" ".join(tw) + "\n")
        f.close()
with open(filedir) as data_file:
    doc_set = list()
    doc_tw = set()
    dc = set()
    weeks = list()
    dist = list()
    lamb = list()
    inicial = 1
    final = 603
    for line in data_file:
        tweet = json.loads(line)
        created = int(tweet['created_at'])
        if days2time(inicial) <= created < days2time(final):
            doc_tw.add(tweet['text'])
            doc_set.append(tweet)

texts = tp.text_process(doc_tw)
corpus, dic = tp.create_corpus(texts)
ldamodel = tp.generate_lda(corpus, dic, 5)
# ldamodel = tp.generate_hdp(corpus, dic)
print(tp.print_topics(ldamodel))

with open(lamb_dir) as l_file:
    for line in l_file:
        i = int(line.split('|')[2])
        w = int(line.split('|')[0])
        lamb.append(w)
        for s in range(i - 1):
            lamb.append(w)

for k in range(inicial, final, 7):
    doc = set()
N_ESTIMATORS = int(args.estimators)
LOSS_FUN = args.loss
KERNEL = args.kernel

print('Word2Vec embedding: %s' % (W2VEC_MODEL_FILE))
print('Embedding Dimension: %d' % (EMBEDDING_DIM))

cf = configparser.ConfigParser()
cf.read("../file_path.properties")
path = dict(cf.items("file_path"))
dir_w2v = path['dir_w2v']
dir_in = path['dir_in']

word2vec_model = gensim.models.Word2Vec.load(dir_w2v + W2VEC_MODEL_FILE)
tp = TextProcessor()
doc_list, tw_class = load_files(dir_in)
tweets = tp.text_process(doc_list, text_only=True)
tweets = select_tweets(tweets)

X, Y = gen_data(tweets, tw_class)
model = classification_model(X, Y, MODEL_TYPE)
joblib.dump(model, dir_in + MODEL_TYPE + '.skl')

# python BoWV.py --model logistic --seed 42 -f model_word2vec -d 100 --folds 10
# python BoWV.py --model gradient_boosting --seed 42 -f model_word2vec -d 100 --loss deviance --folds 10
# python BoWV.py --model random_forest --seed 42 -f model_word2vec -d 100 --estimators 20 --folds 10
# python BoWV.py --model svm_linear --seed 42 -f model_word2vec -d 100 --loss squared_hinge --folds 10
# python BoWV.py --model svm --seed 42 -f model_word2vec -d 100 --kernel rbf --folds 10
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list, text_only=True)

    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))

    alea_tw_processed = list()
    for l in list_aleatory:
        alea_tw_processed.append(tp.text_process(l, text_only=True))
    for i, l in enumerate(alea_tw_processed):
        alea_tw_processed[i] = [n for n in l if n]

    with open(dir_out + "bgr_tfidf_like.pck", 'rb') as handle:
        parl_bigrams = pickle.load(handle)