def mergecolumns(file1, file2, outname):
    f = loadlines(file1)
    h = loadlines(file2)
    g = codecs.open(outname, 'w', encoding='utf-8')
    # QUOTE_NONE with a space escape/quote character avoids csv quoting in the tab-separated output
    gc = csv.writer(g,
                    quoting=csv.QUOTE_NONE,
                    escapechar=' ',
                    quotechar=' ',
                    delimiter='\t')
    lines = []
    # f and h (i.e. file1 and file2) should have the same number of lines
    for i in range(len(f)):
        lines.append([f[i], h[i]])
    gc.writerows(lines)
    g.close()
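
# mergecolumns relies on a loadlines helper that is not shown in this excerpt.
# A minimal sketch of what it presumably does (an assumption, not the original
# implementation): read a UTF-8 file and return its lines without trailing
# newlines, so the two files can be paired one-to-one.
import codecs

def loadlines(fname):
    with codecs.open(fname, 'r', encoding='utf-8') as fh:
        return [line.rstrip('\r\n') for line in fh]
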
def cleanandscore_emoji_sentiwords(inname, outname, emdicname,
                                   sentiworddicname, saturationfunction):
    f = open(inname, 'r')
    # an explicit output encoding is needed, otherwise Python falls back to the platform default (e.g. Latin-1/cp1252 on Windows)
    g = codecs.open(outname, 'w', encoding='utf-8')
    gc = csv.writer(g, delimiter='\t')
    cnt = []
    # preserve all lines (lines containing neither emoticons nor sentiwords will have zero score)
    for line in f.readlines():
        cnt.append(line.lower())
    f.close()
    # strip duplicate lines
    cnt1 = list(set(cnt))
    cnt = []
    # drop retweets (mentions, hashtags and URLs are stripped below)
    for line in cnt1:
        if rtw.search(line) is None:
            cnt.append(line)
    # remove mentions, hashtags and URLs
    cnt1 = cnt
    cnt = []
    for line in cnt1:
        cnt.append(tco.sub(' ', htg.sub(' ', mnt.sub(' ', line))))
    ### append emoticon scores ###
    lines = cnt
    cnt = None
    ed = importemoticonscoredict(emdicname)  # contains integer scores
    sd = importsentixscoredict(
        sentiworddicname)  # contains averaged floating point scores
    for i in range(len(lines)):
        items = utf16emojis1.findall(lines[i]) + utf16emojis2.findall(lines[i])
        score = 0
        # accumulate score for emojis in the score dictionary
        for em in items:
            if em in ed:
                score = score + int(ed[em])
        # strip the matched emojis and any \r, \n, \t once the emoji score has been accumulated
        lines[i] = lowlevelclean(
            re.sub('[\r\n\t]', ' ',
                   utf16emojis2.sub(' ', utf16emojis1.sub(' ', lines[i]))))
        # after cleaning, add sentiword contributions to the accumulated score
        items = lines[i].split()
        for w in items:
            if w in sd:
                score = score + float(sd[w])
        ### saturation is applied here via saturationfunction; it could instead live in a separate saturate method, or the number of classes could be made an input parameter ###
        lines[i] = [lines[i], str(saturationfunction(score))]
    # write to csv file
    gc.writerows(lines)
    g.close()
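
# The cleaning functions in this module assume several module-level helpers
# that are defined elsewhere: compiled patterns rtw, mnt, htg, tco,
# utf16emojis1 and utf16emojis2, plus a lowlevelclean function. The sketches
# below are illustrative assumptions (Python 3), not the original definitions.
import re

rtw = re.compile(r'\brt\b\s*@', re.IGNORECASE)   # retweet marker "RT @user"
mnt = re.compile(r'@\w+')                        # @mentions
htg = re.compile(r'#\w+')                        # #hashtags
tco = re.compile(r'https?://\S+')                # URLs, e.g. shortened t.co links
# BMP emoticon-like symbols and non-BMP emoji blocks (the latter are encoded as
# UTF-16 surrogate pairs on narrow builds); the exact ranges here are examples only.
utf16emojis1 = re.compile(u'[\u2600-\u27bf]')
utf16emojis2 = re.compile(u'[\U0001F300-\U0001FAFF]')

def lowlevelclean(text):
    # assumed behaviour: collapse runs of whitespace and trim the line
    return re.sub(r'\s+', ' ', text).strip()
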
def cleanandscore(inname, outname, dicname):
    f = open(inname, 'r')
    # an explicit output encoding is needed, otherwise Python falls back to the platform default (e.g. Latin-1/cp1252 on Windows)
    g = codecs.open(outname, 'w', encoding='utf-8')
    gc = csv.writer(g, delimiter='\t')

    # preserve lines with UTF-16 emoticons
    cnt = []
    for line in f.readlines():
        if utf16emojis1.search(line) is not None or utf16emojis2.search(
                line) is not None:
            cnt.append(line)
    f.close()
    # strip duplicate lines
    cnt1 = list(set(cnt))
    cnt = []
    # drop retweets (mentions, hashtags and URLs are stripped below)
    for line in cnt1:
        if rtw.search(line) is None:
            cnt.append(line)
    # remove mentions, hashtags and URLs
    cnt1 = cnt
    cnt = []
    for line in cnt1:
        cnt.append(tco.sub(' ', htg.sub(' ', mnt.sub(' ', line))))
    # append emoticon scores
    lines = cnt
    cnt = None
    dic = importemoticonscoredict(dicname)
    for i in range(len(lines)):
        items = utf16emojis1.findall(lines[i]) + utf16emojis2.findall(lines[i])
        score = 0
        # accumulate score for emojis in the score dictionary
        for em in items:
            if em in dic:
                score = score + int(dic[em])
        # strip the matched emojis and any \r, \n, \t, then append the score
        lines[i] = [
            re.sub('[\r\n\t]', ' ',
                   utf16emojis2.sub(' ', utf16emojis1.sub(' ', lines[i]))),
            str(score)
        ]
        # low-level cleaning and line feed restore
        lines[i][0] = lowlevelclean(lines[i][0])
    # write to csv file
    gc.writerows(lines)
    g.close()
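
# Example usage. The file names and the three-class saturation function below
# are hypothetical; importemoticonscoredict and importsentixscoredict are the
# dictionary loaders defined elsewhere in this module.

def saturate3(score):
    # map an accumulated score onto three sentiment classes
    if score > 0:
        return 1
    if score < 0:
        return -1
    return 0

# keep only emoji-bearing tweets and score them against the emoticon dictionary
cleanandscore('tweets_raw.txt', 'tweets_emoji_scored.tsv', 'emoji_scores.tsv')

# score every tweet with emoticons plus sentiment words, saturated to 3 classes
cleanandscore_emoji_sentiwords('tweets_raw.txt', 'tweets_scored.tsv',
                               'emoji_scores.tsv', 'sentix_scores.tsv', saturate3)

# pair up two single-column files line by line into one tab-separated file
mergecolumns('tweets_texts.txt', 'tweets_labels.txt', 'tweets_merged.tsv')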