import csv

import dataCleaner_withStem


def dataReader(in_file_path, n_grams, tid_dict, sc_dict):
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Create a CSV reader for the input file.
        # File format: TweetID | Tweet | class (sentiment)
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 2:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            tweet = row[1]
            sentiment = row[2]
            # Only consider unique tweets whose sentiment class is known.
            if tid not in tid_dict and sentiment in sc_dict:
                tid_dict[tid] = sentiment
                sc_dict[sentiment] += 1
                # Obtain individual tokens from the tweet: special characters
                # (other than those used in smileys) and numbers are removed,
                # the text is lower-cased, and stop words are dropped.
                words = dataCleaner_withStem.removeStopWords(
                    dataCleaner_withStem.removeSpecialCharacters(tweet), 1)
                for token in words:
                    if token in n_grams:
                        n_grams[token] += 1
                    else:
                        n_grams[token] = 1
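# A minimal usage sketch for dataReader, kept as comments so the module stays
# importable. The file name and the sentiment labels below are illustrative
# assumptions, not values taken from this project.
#
#     n_grams = {}
#     tid_dict = {}
#     sc_dict = {'positive': 0, 'negative': 0, 'neutral': 0}
#     dataReader('train.tsv', n_grams, tid_dict, sc_dict)
#     # n_grams now maps each token to its corpus frequency, tid_dict maps
#     # each unique tweet ID to its label, and sc_dict holds class counts.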
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Initialize the term-document matrix (TDM): one dictionary per tweet
        # ID, mapping every selected n-gram of every topic to a zero count.
        for tid in tid_dict:
            tdm_dict[tid] = {}
            for topic in selected_n_grams:
                for token in selected_n_grams[topic]:
                    tdm_dict[tid][token] = 0
        # Create a CSV reader for the input file.
        # File format: TweetID | Topic | Tweet | class (sentiment)
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 3:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            # topic = row[1]
            tweet = row[2]
            words = dataCleaner_withStem.removeStopWords(
                dataCleaner_withStem.removeSpecialCharacters(
                    dataCleaner_withStem.convertEmoticons(tweet)), 1)
            if tid in tid_dict:
                for topic in selected_n_grams:
                    for token in selected_n_grams[topic]:
                        if token in words:
                            tdm_dict[tid][token] += 1
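# A minimal usage sketch for the topic-based createFV, again as comments. The
# file name and the selected_n_grams contents are illustrative assumptions;
# the function only reads the keys of each topic's token dictionary.
#
#     selected_n_grams = {'apple': {'iphone': 0, 'battery': 0}}
#     tdm_dict = {}
#     createFV('train_topics.tsv', selected_n_grams, tdm_dict, tid_dict, sc_dict)
#     # tdm_dict[tid] is now the feature vector of token counts for tweet tid.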
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    # Variant for the topic-free file format: TweetID | Tweet | class (sentiment).
    with open(in_file_path, 'r', newline='') as tsv_in:
        # Initialize the TDM: a fresh zeroed count dictionary per tweet ID,
        # rather than aliasing selected_n_grams itself.
        for tid in tid_dict:
            tdm_dict[tid] = {token: 0 for token in selected_n_grams}
        # Create a CSV reader for the input file.
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            if len(row) <= 2:
                continue  # skip malformed rows without a sentiment column
            tid = row[0]
            tweet = row[1]
            words = dataCleaner_withStem.removeStopWords(
                dataCleaner_withStem.removeSpecialCharacters(
                    dataCleaner_withStem.convertEmoticons(tweet)), 1)
            # Alternative cleaning pipeline without emoticon conversion:
            # words = dataCleaner_withStem.removeStopWords(
            #     dataCleaner_withStem.removeSpecialCharacters(tweet), 1)
            if tid in tid_dict:
                for token in selected_n_grams:
                    if token in words:
                        tdm_dict[tid][token] += 1
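# A minimal usage sketch for the topic-free createFV variant, as comments. The
# file name and the tokens are illustrative assumptions; here selected_n_grams
# is a flat token dictionary rather than a per-topic mapping.
#
#     selected_n_grams = {'iphone': 0, 'battery': 0}
#     tdm_dict = {}
#     createFV('train.tsv', selected_n_grams, tdm_dict, tid_dict, sc_dict)
#     # Each tdm_dict[tid] maps every selected token to its count in tweet tid.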