Пример #1
0
import fileinput
import re
import sys
import time

from itertools import groupby
from nltk import wordpunct_tokenize
from web import isurl, isquestion, get_files


# Walk the records from get_files(), grouped by their first two fields, and
# keep only the groups whose query is a question and not a URL.
# NOTE(review): itertools.groupby only merges *adjacent* records with equal
# keys, so this assumes get_files() yields records already ordered by
# (line[0], line[1]) - TODO confirm.
for key, lines in groupby(get_files(), lambda line: (line[0], line[1])):
  # Second component of the group key, i.e. line[1] of the grouped records.
  query = key[1]

  # Skip groups whose query isurl() reports as truthy.
  if isurl(query):
    continue

  # isquestion() is only tested for truthiness here; a falsy result skips
  # the group.
  qword = isquestion(query)
  if not qword:
    continue

  for line in lines:
    # NOTE(review): answers is re-created on every iteration, so after the
    # loop it only reflects the last record of the group - confirm that this
    # is intended rather than accumulating across the whole group.
    answers = []
    if len(line) == 3:
      id, query, ts = line
    else:
      # Any other length is unpacked as exactly five fields (a different
      # length raises ValueError); the fifth field is collected.
      id, query, ts, _, web = line
      answers.append(web)

  # Fall back to an empty string when no fifth field was collected.
  if len(answers) == 0:
    ans = ''
  else:
Пример #2
0
import fileinput
import sys
import time

from liwc import countcat
from web import isurl, isquestion, get_files

# Print a tab-separated table on stdout: a header line, then one row per
# record from get_files() with the record id, a Unix timestamp, three counts
# from countcat(), a URL flag, a question flag and a click flag.
#
# The print calls take a single parenthesised argument so they behave the same
# under the Python 2 print statement and the Python 3 print function.
print('id\tts\tpos\tneg\tword\tis_url\tquestion\tclicked')
for line in get_files():
  # Only the first three fields are used; the field count alone decides the
  # click flag. Any length other than 4 is unpacked as exactly five fields,
  # so a different length raises ValueError.
  # NOTE(review): 4 fields -> clicked=1 and 5 fields -> clicked=0 is kept
  # exactly as found; confirm against what get_files() actually yields.
  if len(line) == 4:
    anon_id, query, ts, _ = line
    clicked = 1
  else:
    anon_id, query, ts, _, _ = line
    clicked = 0
  # Presumably the column-header row of the input - it is not data, skip it.
  if anon_id == 'AnonID':
    continue
  # emotion
  # NOTE(review): the meaning of counts[0..2] is taken from the output column
  # names (pos, neg, word); verify against liwc.countcat.
  counts = countcat(query)
  pos, neg, word = counts[0], counts[1], counts[2]
  # internet competence
  is_url = int(isurl(query))
  # This column used to be hard-coded to 1 for every row even though
  # isquestion was imported; derive it from the query instead. isquestion()
  # is reduced to its truthiness so any truthy return value maps to 1.
  is_question = int(bool(isquestion(query)))
  # time.mktime() interprets the parsed struct_time as local time.
  ts = int(time.mktime(time.strptime(ts, '%Y-%m-%d %H:%M:%S')))
  row = [anon_id, ts, pos, neg, word, is_url, is_question, clicked]
  print('\t'.join([str(x) for x in row]))