def mergecolumns(file1, file2, outname):
    """Merge two line-aligned files into a two-column tab-separated file.

    file1, file2 -- input paths read via loadlines(); they are expected to
                    hold the same number of lines (one output row per pair).
    outname      -- output path, written as UTF-8 with all CSV quoting
                    disabled so content passes through verbatim.
    """
    f = loadlines(file1)
    h = loadlines(file2)
    # Explicit encoding so the platform default (e.g. Latin-1 on Windows)
    # is never used; `with` guarantees the file is closed on any exit path.
    with codecs.open(outname, 'w', encoding='utf-8') as g:
        gc = csv.writer(g, quoting=csv.QUOTE_NONE, escapechar=' ',
                        quotechar=' ', delimiter='\t')
        # f and h should have the same number of lines; zip pairs them
        # without the manual index loop (extra trailing lines on either
        # side are dropped rather than raising).
        gc.writerows([a, b] for a, b in zip(f, h))
def cleanandscore_emoji_sentiwords(inname, outname, emdicname, sentiworddicname, saturationfunction):
    """Clean tweet lines and score them with emoji + sentiword dictionaries.

    inname             -- raw input file, one tweet per line.
    outname            -- UTF-8 TSV output: <cleaned line>\\t<saturated score>.
    emdicname          -- emoticon score dictionary file (integer scores).
    sentiworddicname   -- sentix word score dictionary file (float averages).
    saturationfunction -- callable mapping the accumulated score to a class.
    """
    # Preserve all lines (lines containing neither emoticons nor sentiwords
    # will simply end up with a zero score), lowercased.
    with open(inname, 'r') as f:
        cnt = [line.lower() for line in f]
    # Strip duplicate lines (note: set() does not preserve input order).
    cnt = list(set(cnt))
    # Drop retweets, then blank out mentions, hashtags and URLs.
    lines = [tco.sub(' ', htg.sub(' ', mnt.sub(' ', line)))
             for line in cnt if rtw.search(line) is None]
    ed = importemoticonscoredict(emdicname)        # integer emoji scores
    sd = importsentixscoredict(sentiworddicname)   # averaged float word scores
    for i, line in enumerate(lines):
        emojis = utf16emojis1.findall(line) + utf16emojis2.findall(line)
        # Accumulate the score of every emoji present in the dictionary;
        # `em in ed` is a direct O(1) lookup (no .keys() call needed).
        score = sum(int(ed[em]) for em in emojis if em in ed)
        # Remove all emojis in the non-BMP UTF-16 block and control
        # whitespace, then apply the low-level cleaning pass.
        cleaned = lowlevelclean(
            re.sub('[\r\n\t]', ' ',
                   utf16emojis2.sub(' ', utf16emojis1.sub(' ', line))))
        # After cleaning, add the sentiword contributions to the score.
        score += sum(float(sd[w]) for w in cleaned.split() if w in sd)
        # Saturation maps the raw score to a class label (saturation could
        # also live in the saturate method, with #classes as a parameter).
        lines[i] = [cleaned, str(saturationfunction(score))]
    # Explicit output encoding assignment is needed, otherwise Python will
    # choose the system's default encoding (Latin-1 for Windows).
    with codecs.open(outname, 'w', encoding='utf-8') as g:
        csv.writer(g, delimiter='\t').writerows(lines)
def mergecolumns(file1, file2, outname):
    """Write the lines of file1 and file2 side by side as a two-column,
    tab-separated UTF-8 file with all CSV quoting disabled."""
    left = loadlines(file1)
    right = loadlines(file2)
    out = codecs.open(outname, 'w', encoding='utf-8')
    writer = csv.writer(out, quoting=csv.QUOTE_NONE, escapechar=' ',
                        quotechar=' ', delimiter='\t')
    # left and right are expected to be equally long; pair them row by row.
    rows = [[left[idx], right[idx]] for idx in range(len(left))]
    writer.writerows(rows)
    out.close()
def cleanandscore(inname, outname, dicname):
    """Clean emoji-bearing tweet lines and score them with an emoticon dict.

    inname  -- raw input file, one tweet per line; lines carrying no non-BMP
               UTF-16 emoji are discarded.
    outname -- UTF-8 TSV output: <cleaned line>\\t<emoji score>.
    dicname -- emoticon score dictionary file (integer scores).
    """
    # Preserve only lines containing at least one UTF-16 emoticon.
    with open(inname, 'r') as f:
        cnt = [line for line in f
               if utf16emojis1.search(line) is not None
               or utf16emojis2.search(line) is not None]
    # Strip duplicate lines (note: set() does not preserve input order).
    cnt = list(set(cnt))
    # Drop retweets, then blank out mentions, hashtags and URLs.
    lines = [tco.sub(' ', htg.sub(' ', mnt.sub(' ', line)))
             for line in cnt if rtw.search(line) is None]
    dic = importemoticonscoredict(dicname)  # integer emoji scores
    for i, line in enumerate(lines):
        emojis = utf16emojis1.findall(line) + utf16emojis2.findall(line)
        # `em in dic` is a direct O(1) hash lookup — avoids materializing
        # dic.keys() and scanning it per emoji.
        score = sum(int(dic[em]) for em in emojis if em in dic)
        # Remove all emojis in the non-BMP UTF-16 block and control
        # whitespace, then apply the low-level cleaning pass.
        cleaned = lowlevelclean(
            re.sub('[\r\n\t]', ' ',
                   utf16emojis2.sub(' ', utf16emojis1.sub(' ', line))))
        lines[i] = [cleaned, str(score)]
    # Explicit output encoding assignment is needed, otherwise Python will
    # choose the system's default encoding (Latin-1 for Windows).
    with codecs.open(outname, 'w', encoding='utf-8') as g:
        csv.writer(g, delimiter='\t').writerows(lines)
def cleanandscore_emoji_sentiwords(inname, outname, emdicname, sentiworddicname, saturationfunction):
    """Score each input line by its emojis and sentiment words, writing
    <cleaned text>\\t<saturated score> rows to a UTF-8 TSV file."""
    source = open(inname, 'r')
    # Explicit output encoding assignment is needed, otherwise Python will
    # choose the system's default encoding (Latin-1 for Windows).
    sink = codecs.open(outname, 'w', encoding='utf-8')
    writer = csv.writer(sink, delimiter='\t')
    # Keep every line lowercased, duplicates collapsed (lines containing
    # neither emoticons nor sentiwords will simply score zero).
    unique = set(line.lower() for line in source.readlines())
    source.close()
    # Drop retweets; blank out mentions, hashtags and URLs.
    rows = []
    for line in unique:
        if rtw.search(line) is not None:
            continue
        rows.append(tco.sub(' ', htg.sub(' ', mnt.sub(' ', line))))
    emoji_scores = importemoticonscoredict(emdicname)      # integer scores
    word_scores = importsentixscoredict(sentiworddicname)  # float averages
    for idx in range(len(rows)):
        text = rows[idx]
        total = 0
        # Emoji contribution from the non-BMP UTF-16 blocks.
        for emoji in utf16emojis1.findall(text) + utf16emojis2.findall(text):
            if emoji in emoji_scores.keys():
                total = total + int(emoji_scores[emoji])
        # Strip all non-BMP emojis plus control whitespace, then run the
        # low-level cleaning pass on what remains.
        text = lowlevelclean(re.sub('[\r\n\t]', ' ', utf16emojis2.sub(' ', utf16emojis1.sub(' ', text))))
        # Sentiword contribution, computed on the cleaned text.
        for word in text.split():
            if word in word_scores.keys():
                total = total + float(word_scores[word])
        # Saturate the accumulated score into a class label.
        rows[idx] = [text, str(saturationfunction(total))]
    writer.writerows(rows)
    sink.close()
def cleanandscore(inname, outname, dicname):
    """Keep only lines containing non-BMP emojis, clean them, and write
    <cleaned text>\\t<emoji score> rows to a UTF-8 TSV file."""
    source = open(inname, 'r')
    # Explicit output encoding assignment is needed, otherwise Python will
    # choose the system's default encoding (Latin-1 for Windows).
    sink = codecs.open(outname, 'w', encoding='utf-8')
    writer = csv.writer(sink, delimiter='\t')
    # Preserve only lines that carry at least one UTF-16 emoticon.
    kept = []
    for line in source.readlines():
        if utf16emojis1.search(line) is not None or utf16emojis2.search(line) is not None:
            kept.append(line)
    source.close()
    # Collapse duplicates, drop retweets, blank mentions/hashtags/URLs.
    rows = []
    for line in set(kept):
        if rtw.search(line) is None:
            rows.append(tco.sub(' ', htg.sub(' ', mnt.sub(' ', line))))
    scores = importemoticonscoredict(dicname)
    lookup = scores.keys()
    for idx in range(len(rows)):
        text = rows[idx]
        total = 0
        # Accumulate the score of every emoji present in the dictionary.
        for emoji in utf16emojis1.findall(text) + utf16emojis2.findall(text):
            if emoji in lookup:
                total = total + int(scores[emoji])
        # Drop all non-BMP emojis plus control whitespace.
        stripped = re.sub('[\r\n\t]', ' ', utf16emojis2.sub(' ', utf16emojis1.sub(' ', text)))
        # Low-level cleaning and line-feed restore on the text column.
        rows[idx] = [lowlevelclean(stripped), str(total)]
    writer.writerows(rows)
    sink.close()