Example #1
def extract_text_for_BTM_topic_distribution(source, output_file):
    tweets = []
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        tweet = {}
        # fix mojibake, then strip URLs, @usernames, '#' signs, and leftover HTML
        clean_text = common.cleanhtml(
            common.remove_hashtag_sign(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text'])))))
        # lowercase, lemmatize, and drop stopwords; skip tweets with no tokens left
        temp = [
            wordnet_lemmatizer.lemmatize(word.lower())
            for word in nltk.regexp_tokenize(clean_text, pattern)
            if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist
        ]
        if len(temp) == 0:
            continue
        preprocessed_text = ' '.join(temp)
        # parse the Twitter created_at timestamp and bucket the tweet by year-month
        date = datetime.strptime(
            row['created_at'],
            '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC)
        y_m = date.strftime('%Y-%m')
        tweet['clean_text'] = clean_text
        tweet['us_state'] = row['us_state']
        tweet['preprocessed_text'] = preprocessed_text
        tweet['date'] = y_m
        tweets.append(tweet)
    logger.info(len(tweets))
    to_csv(tweets, output_file)
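All of these examples rely on the same module-level setup that the snippets themselves do not show: the imports, a token `pattern` for nltk.regexp_tokenize, a module `logger`, and project-local helpers (load_stoplist, to_csv, filter_by_frequency, and the common.* cleaning functions). A minimal sketch of that assumed preamble follows; the exact token regex and logging setup are guesses, and the project helpers are only named here, not reimplemented.

# Assumed module-level preamble (a sketch; the regex and logger setup are guesses,
# and load_stoplist/to_csv/filter_by_frequency/common.* are project helpers not shown here).
import csv
import json
import logging
import random
import re
from datetime import datetime

import ftfy
import nltk
import pandas as pd
import pytz
from gensim import corpora
from nltk.stem import WordNetLemmatizer

import common  # project-local: remove_url, remove_username, remove_hashtag_sign, cleanhtml

logger = logging.getLogger(__name__)

# token pattern passed to nltk.regexp_tokenize; the exact expression is an assumption
pattern = r'\w+'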
Example #2
def extract_hashtag(source):
    hashtag = {}
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        if '#' in row['text']:
            text = common.cleanhtml(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text']))))
            # count each hashtag, case-insensitively
            hashtags = re.findall(r"#(\w+)", text)
            for tag in hashtags:
                hashtag[tag.lower()] = hashtag.get(tag.lower(), 0) + 1
    print(hashtag)
    with open('./intermediate_data/hastags.json', 'w') as outfile:
        json.dump(hashtag, outfile)
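The counting loop above can also be written with collections.Counter; the following is an equivalent sketch (an alternative formulation reusing the df and cleaning helpers from the function above, not the author's code):

from collections import Counter

# same cleaning as above, then tally lowercased hashtags in one pass
counts = Counter()
for text in df['text']:
    if '#' in text:
        clean = common.cleanhtml(
            common.remove_username(common.remove_url(ftfy.fix_text(text))))
        counts.update(tag.lower() for tag in re.findall(r"#(\w+)", clean))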
Example #3
def extract_tweet_not_by_uid(source):
    tweets = []
    BTM_input = []
    wordnet_lemmatizer = WordNetLemmatizer()
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        clean_text = common.cleanhtml(
            common.remove_hashtag_sign(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text'])))))
        # lowercase and lemmatize every token (no stopword filtering for this output)
        temp = [
            wordnet_lemmatizer.lemmatize(word.lower())
            for word in nltk.regexp_tokenize(clean_text, pattern)
        ]
        if len(temp) == 0:
            continue
        preprocessed_text = ' '.join(temp)
        tweets.append(row['text'])
        BTM_input.append(preprocessed_text)

    # write the raw tweets to CSV; open in 'w' so the header is not duplicated on re-runs
    with open('./intermediate_data/hpv_tweets/hpv_tweets_not_by_uid.csv',
              'w',
              newline='',
              encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f,
                                fieldnames=['tweets'],
                                delimiter=',',
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for tweet in tweets:
            writer.writerow({'tweets': tweet})
    with open(
            './intermediate_data/hpv_tweets/hpv_tweets_not_by_uid_BTM_input.txt',
            'w',
            encoding='utf-8') as outfile:
        for tweet in BTM_input:
            outfile.write(tweet + '\n')
Example #4
def extract_clean_text(json_file):
    wv = []
    cnt = 0
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    with open(json_file, 'r') as f:
        user_tweets = json.load(f)
        for user in user_tweets:
            # concatenate all of the user's cleaned tweets into one document
            text = ''
            for tweet in user_tweets[user]:
                text += common.cleanhtml(
                    common.remove_hashtag_sign(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(tweet))))) + ' '
            # lowercase, lemmatize, and drop stopwords
            clean_texts = [
                wordnet_lemmatizer.lemmatize(word.lower())
                for word in nltk.regexp_tokenize(text, pattern)
                if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist
            ]
            wv.append(clean_texts)
            cnt += 1
    logger.info('total users: %d;' % cnt)
    return wv
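extract_clean_text returns one lemmatized token list per user; the name wv hints that these lists serve as sentences for a word-vector model, although that downstream step is not shown in the source. A hedged illustration with gensim's Word2Vec (the file paths and hyperparameters below are placeholder assumptions):

from gensim.models import Word2Vec

# one token list per user, produced by the function above (hypothetical input path)
sentences = extract_clean_text('./intermediate_data/user_tweets.json')

# vector_size is the gensim >= 4.0 name for the embedding dimension (older releases used `size`)
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=2, workers=4)
model.wv.save('./intermediate_data/tweet_word_vectors.kv')  # hypothetical output path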
Example #5
def generate_corpus_for_quality_evaluation(k, pz_d, tweets,
                                           topic_words_distribution):
    all_tweets = []
    logger.info(k)
    df = pd.read_csv(tweets, encoding='utf-8')
    for index, row in df.iterrows():
        all_tweets.append(row['tweets'])

    # each line of the BTM pz_d output holds one tweet's space-separated topic distribution
    tweets_pz_d = []
    with open(pz_d) as f:
        for l in f:
            tweets_pz_d.append(l.strip().split(' '))

    # group tweets by their most probable topic, skipping documents whose
    # distribution contains nan/-nan (checked on the raw tokens, before float conversion)
    results = {}
    for j in range(len(tweets_pz_d)):
        if 'nan' not in tweets_pz_d[j] and '-nan' not in tweets_pz_d[j]:
            probs = [float(p) for p in tweets_pz_d[j]]
            topic_id = probs.index(max(probs))
            if topic_id not in results:
                results[topic_id] = [all_tweets[j]]
            else:
                results[topic_id].append(all_tweets[j])

    final_result = []
    for tp in results:
        for keyword in topic_words_distribution[tp][1]:
            temp = []
            dedup = set()
            for tweet in results[tp]:
                if str(keyword[0]) in tweet.lower():
                    clean_text_list = (common.cleanhtml(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(
                                tweet.lower()))))).strip(' ').replace(
                                    '\n', ' ').split(' ')[:-1]
                    clean_text = ",".join(str(x) for x in clean_text_list)
                    if clean_text not in dedup:
                        temp.append(tweet)
                        dedup.add(clean_text)

            # samples_number = random.sample(range(1, len(temp)), 1)
            # if (tp == 6) and (keyword[0] == 'u.s.'):
            #     logger.info(temp)
            #     quit()

            # sample at most two example tweets per (topic, keyword) pair
            if len(temp) <= 2:
                samples_number = range(len(temp))
            else:
                samples_number = random.sample(range(len(temp)), 2)
            for i in samples_number:
                result = {}
                result['topic_id'] = tp
                result['keyword'] = keyword[0]
                result['probability'] = keyword[1]
                result['tweet'] = temp[i]
                final_result.append(result)

    to_csv(
        final_result,
        '../../papers/2017_BMC_HPV/analysis/BTM/quality_evaluation/' + str(k) +
        'tp.csv')
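generate_corpus_for_quality_evaluation assumes the tweets CSV has a 'tweets' column, that pz_d is BTM's per-document topic-distribution file, and that topic_words_distribution[tp][1] holds (word, probability) pairs for topic tp. A sketch of a matching call, with illustrative values and hypothetical paths:

# illustrative shape only: element [1] of each entry is a list of (word, probability) pairs
topic_words_distribution = [
    [0, [('vaccine', 0.082), ('hpv', 0.075), ('cancer', 0.041)]],
    [1, [('gardasil', 0.066), ('girl', 0.034), ('shot', 0.029)]],
]

generate_corpus_for_quality_evaluation(
    k=2,
    pz_d='./BTM/output/2tp/k2.pz_d',              # hypothetical BTM output file
    tweets='./intermediate_data/hpv_tweets.csv',  # hypothetical CSV with a 'tweets' column
    topic_words_distribution=topic_words_distribution)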
Example #6
def group_tweets_by_cluster_gold_standard(source, k):
    tweets = []
    all_tweets_in_cluster = []
    wordnet_lemmatizer = WordNetLemmatizer()

    # read all tweets
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        tweets.append(row['text'])

    # hashtag groups defining the gold-standard clusters, one list of tags per cluster
    with open('./intermediate_data/cluster_hashtags.json', 'r') as json_file:
        hashtags = json.load(json_file)
    for i in range(k):
        # with open('./BTM/output/' + str(k) + 'tp/clusters/' + str(i) + 'tp.txt', 'w') as clusters:
        with open('./intermediate_data/LDA_BTM_comparison/clusters/' + str(i) +
                  'tp.txt',
                  'w',
                  encoding="utf-8") as clusters:
            print(i)
            for tweet in tweets:
                tags = re.findall(r"#(\w+)", tweet)
                if len(tags) != 0:
                    for tag in tags:
                        if tag.lower() in hashtags[i]:
                            text = common.cleanhtml(
                                common.remove_hashtag_sign(
                                    common.remove_username(
                                        common.remove_url(
                                            ftfy.fix_text(tweet)))))
                            clean_texts = [
                                wordnet_lemmatizer.lemmatize(word.lower())
                                for word in nltk.regexp_tokenize(
                                    text, pattern)
                            ]
                            final_text = ' '.join(clean_texts)
                            all_tweets_in_cluster.append(final_text)
                            # final_text = re.sub(r"[\u4e00-\u9fff]", "", final_text)
                            clusters.write(final_text)
                            clusters.write('\n')
                            break

    # txt for BTM
    with open(
            './intermediate_data/LDA_BTM_comparison/lda_BTM_comparison_traning_data.txt',
            'w',
            encoding="utf-8") as file:
        for tweet in all_tweets_in_cluster:
            file.write(tweet)
            file.write('\n')

    # csv for LDA
    fieldnames = ['clean_text']
    with open(
            './intermediate_data/LDA_BTM_comparison/lda_BTM_comparison_traning_data.csv',
            'w',
            newline='',
            encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f,
                                fieldnames=fieldnames,
                                delimiter=',',
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for tweet in all_tweets_in_cluster:
            writer.writerow({'clean_text': tweet})
Example #7
def load_data(csv_file, text_fields=[]):
    data = pd.read_csv(csv_file, encoding='utf-8')

    # concatenate the cleaned text fields of each row into a single document
    documents = []
    for i, r in data.iterrows():
        document = ''
        for text_field in text_fields:
            if pd.notnull(r[text_field]):
                document = '%s  %s' % (document,
                                       common.cleanhtml(
                                           common.remove_hashtag_sign(
                                               common.remove_username(
                                                   common.remove_url(
                                                       ftfy.fix_text(r[text_field]))))))
        documents.append(document)

    logger.info("# of documents: %d" % len(documents))

    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()

    # lowercase, lemmatize, and drop stopwords in every document
    texts = [[
        wordnet_lemmatizer.lemmatize(word.lower())
        for word in nltk.regexp_tokenize(document, pattern)
        if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist
    ] for document in documents]

    # prune tokens by frequency, then build the gensim dictionary and bag-of-words corpus
    texts = filter_by_frequency(texts)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return dictionary, corpus
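load_data returns a gensim dictionary and bag-of-words corpus, which is the standard input for gensim's topic models. A minimal usage sketch (the CSV path, field name, and topic count are illustrative assumptions, not taken from the source):

from gensim import models

# hypothetical call: a tweet CSV whose text lives in a 'text' column
dictionary, corpus = load_data('./intermediate_data/hpv_tweets.csv', text_fields=['text'])

# train a plain LDA model; num_topics and passes are arbitrary choices here
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10)

for topic_id, words in lda.show_topics(num_topics=10, num_words=10, formatted=False):
    print(topic_id, [w for w, _ in words])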