import csv

import dataCleaner_withStem


def dataReader(in_file_path, n_grams, tid_dict, sc_dict):
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Create a CSV reader for the input file.
        # File format: TweetID | Tweet | class (sentiment)
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 2:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            tweet = row[1]
            sentiment = row[2]
            # Only consider unique tweets whose sentiment class is known.
            if tid not in tid_dict and sentiment in sc_dict:
                tid_dict[tid] = sentiment
                sc_dict[sentiment] += 1
                # Obtain individual tokens from the tweet: special characters
                # (other than those used in smileys) and numbers are removed,
                # the text is lower-cased, and stop words are dropped.
                words = dataCleaner_withStem.removeStopWords(
                    dataCleaner_withStem.removeSpecialCharacters(tweet), 1)
                for token in words:
                    if token in n_grams:
                        n_grams[token] += 1
                    else:
                        n_grams[token] = 1
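# A minimal usage sketch for dataReader, kept as comments so the module stays
# importable. The file name and the sentiment labels below are illustrative
# assumptions, not values taken from this project.
#
#     n_grams = {}
#     tid_dict = {}
#     sc_dict = {'positive': 0, 'negative': 0, 'neutral': 0}
#     dataReader('train.tsv', n_grams, tid_dict, sc_dict)
#     # n_grams now maps each token to its corpus frequency, tid_dict maps
#     # each unique tweet ID to its label, and sc_dict holds class counts.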
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Initialize the term-document matrix (TDM): one dictionary per tweet
        # ID, mapping every selected n-gram of every topic to a zero count.
        for tid in tid_dict:
            tdm_dict[tid] = {}
            for topic in selected_n_grams:
                for token in selected_n_grams[topic]:
                    tdm_dict[tid][token] = 0
        # Create a CSV reader for the input file.
        # File format: TweetID | Topic | Tweet | class (sentiment)
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 3:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            # topic = row[1]
            tweet = row[2]
            words = dataCleaner_withStem.removeStopWords(
                dataCleaner_withStem.removeSpecialCharacters(
                    dataCleaner_withStem.convertEmoticons(tweet)), 1)
            if tid in tid_dict:
                for topic in selected_n_grams:
                    for token in selected_n_grams[topic]:
                        if token in words:
                            tdm_dict[tid][token] += 1
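# A minimal usage sketch for the topic-based createFV, again as comments. The
# file name and the selected_n_grams contents are illustrative assumptions;
# the function only reads the keys of each topic's token dictionary.
#
#     selected_n_grams = {'apple': {'iphone': 0, 'battery': 0}}
#     tdm_dict = {}
#     createFV('train_topics.tsv', selected_n_grams, tdm_dict, tid_dict, sc_dict)
#     # tdm_dict[tid] is now the feature vector of token counts for tweet tid.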
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    # Variant for the topic-free file format: TweetID | Tweet | class (sentiment).
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Initialize the TDM: a fresh zeroed count dictionary per tweet ID,
        # rather than aliasing selected_n_grams itself.
        for tid in tid_dict:
            tdm_dict[tid] = {token: 0 for token in selected_n_grams}
        # Create a CSV reader for the input file.
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 2:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            tweet = row[1]
            words = dataCleaner_withStem.removeStopWords(
                dataCleaner_withStem.removeSpecialCharacters(
                    dataCleaner_withStem.convertEmoticons(tweet)), 1)
            # Alternative cleaning pipeline without emoticon conversion:
            # words = dataCleaner_withStem.removeStopWords(
            #     dataCleaner_withStem.removeSpecialCharacters(tweet), 1)
            if tid in tid_dict:
                for token in selected_n_grams:
                    if token in words:
                        tdm_dict[tid][token] += 1
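# A minimal usage sketch for the topic-free createFV variant, as comments. The
# file name and the tokens are illustrative assumptions; here selected_n_grams
# is a flat token dictionary rather than a per-topic mapping.
#
#     selected_n_grams = {'iphone': 0, 'battery': 0}
#     tdm_dict = {}
#     createFV('train.tsv', selected_n_grams, tdm_dict, tid_dict, sc_dict)
#     # Each tdm_dict[tid] maps every selected token to its count in tweet tid.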