Пример #1
0
def main():
    """Grow a sentiment dictionary with scores inferred from a tweet stream.

    Command-line arguments:
        sys.argv[1]: path of a file holding one JSON-encoded tweet per line.
        sys.argv[2]: path of the manually labelled dictionary, one
            ``term<TAB>integer score`` entry per line.

    Each tweet's "text" is split on single spaces; every piece is encoded to
    ASCII (dropping what cannot be encoded) and passed through
    ``strip_accents``.  The tweet's score is the mean score of the words that
    are already known, manual scores taking precedence over learned ones.
    When at least one word is known, every word of the tweet has its learned
    score set to that mean, or moved towards it (0.8 * old + 0.2 * mean) if
    it already had one.

    Output (stdout): all manual entries followed by the learned entries that
    have no manual score, as ``term<TAB>score`` lines.

    Blank input lines are skipped.  A tweet whose text cannot be extracted
    contributes no words.

    Raises:
        IndexError: if either command-line argument is missing.
        IOError/OSError: if a file cannot be opened.
        ValueError: if a dictionary line is malformed or a tweet line is not
            valid JSON.
    """
    # Both handles are closed on exit, including when parsing fails midway.
    with open(sys.argv[1]) as tweet_file, open(sys.argv[2]) as dict_file:
        # Construct the original scores' dictionary.
        scores = {}
        for line in dict_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            term, score = line.split("\t")
            scores[term] = int(score)

        newWords = {}

        # For every tweet do the following:
        # Step 1. Calculate scores according to the current state of the
        #         dictionary.
        # Step 2. Update the learned scores of the tweet's words.
        for line in tweet_file:
            if not line.strip():
                continue
            jline = json.loads(line)

            try:
                text = jline["text"]
                words = [
                    strip_accents(word.encode('ascii', 'ignore'))
                    for word in text.strip().split(" ")
                ]
            except Exception:
                # Best effort: entries without usable text are ignored.
                # Not a bare except, so Ctrl-C and SystemExit still stop
                # the run.
                words = []

            score = 0.0
            known = 0

            for word in words:
                # Step 1. Manually scored words are favored.
                # Membership is tested on the dicts themselves: in Python 2
                # ``dict.keys()`` builds a list that is scanned linearly.
                if word in scores:
                    score += scores[word]
                    known += 1
                elif word in newWords:
                    score += newWords[word]
                    known += 1

            if known:
                # Step 2. Simple update for new words.
                for word in words:
                    if word in newWords:
                        newWords[word] = (
                            .8 * newWords[word] + .2 * score / known
                        )
                    else:
                        newWords[word] = score / known

    # A parenthesised single expression prints the same under the Python 2
    # print statement and the Python 3 print function.
    for key in scores:
        print(key + "\t" + str(scores[key]))

    for key in newWords:
        if key not in scores:
            print(key + "\t" + str(newWords[key]))
Пример #2
0
def main():
	"""Extend a manually scored word list using a file of tweets.

	Command-line arguments:
		sys.argv[1]: path of a file holding one JSON-encoded tweet per line.
		sys.argv[2]: path of the manually labelled dictionary, one
			``term<TAB>integer score`` entry per line.

	The "text" of each tweet is split on single spaces; every piece is
	encoded to ASCII (dropping what cannot be encoded) and passed through
	``strip_accents``.  A tweet is scored as the mean score of its already
	known words, where a manual score wins over a learned one.  If at least
	one word is known, each word of the tweet gets that mean as its learned
	score, or, if it already has a learned score, 0.8 * old + 0.2 * mean.

	Output (stdout): every manual entry, then every learned entry that has
	no manual score, as ``term<TAB>score`` lines.

	Blank input lines are skipped.  A tweet whose text cannot be extracted
	contributes no words.

	Raises:
		IndexError: if either command-line argument is missing.
		IOError/OSError: if a file cannot be opened.
		ValueError: if a dictionary line is malformed or a tweet line is
			not valid JSON.
	"""
	# Both handles are closed on exit, including when parsing fails midway.
	with open(sys.argv[1]) as tweet_file, open(sys.argv[2]) as dict_file:
		# Construct the original scores' dictionary.
		scores = {}
		for line in dict_file:
			if not line.strip():
				# Tolerate blank lines (e.g. a trailing empty line).
				continue
			term, score = line.split("\t")
			scores[term] = int(score)

		newWords = {}

		# For every tweet do the following:
		# Step 1. Calculate scores according to the current state of the
		#         dictionary.
		# Step 2. Update the learned scores of the tweet's words.
		for line in tweet_file:
			if not line.strip():
				continue
			jline = json.loads(line)

			try:
				text = jline["text"]
				words = [
					strip_accents(word.encode('ascii', 'ignore'))
					for word in text.strip().split(" ")
				]
			except Exception:
				# Best effort: entries without usable text are ignored.
				# Not a bare except, so Ctrl-C and SystemExit still stop
				# the run.
				words = []

			score = 0.0
			known = 0

			for word in words:
				# Step 1. Manually scored words are favored.
				# Membership is tested on the dicts themselves: in Python 2
				# ``dict.keys()`` builds a list that is scanned linearly.
				if word in scores:
					score += scores[word]
					known += 1
				elif word in newWords:
					score += newWords[word]
					known += 1

			if known:
				# Step 2. Simple update for new words.
				for word in words:
					if word in newWords:
						newWords[word] = .8*newWords[word] + .2*score/known
					else:
						newWords[word] = score/known

	# A parenthesised single expression prints the same under the Python 2
	# print statement and the Python 3 print function.
	for key in scores:
		print(key + "\t" + str(scores[key]))

	for key in newWords:
		if key not in scores:
			print(key + "\t" + str(newWords[key]))
Пример #3
0
def main():
    tweet_file = open(sys.argv[1])
    dict_file = open(sys.argv[2])

    # Construct the original scores' dictionary.
    scores = {}
    for line in dict_file:
        try:
            term, score = line.split("\t")
            scores[term] = float(score)
        except:
            pass

    newWords = {}

    # For every tweet do the following:
    # Step 1. Calculate scores according to the current dictionary.
    # Step 2. If the tweet was given a score, print the coordinates and the score.

    for line in tweet_file:
        jline = json.loads(line)
        try:
            text = jline["text"]
            words = text.split(" ")
            words = [
                strip_accents(word.encode('ascii', 'ignore')) for word in words
            ]

        except:
            words = []

        score = 0.0
        cont = 0.0

        for word in words:
            if word in scores.keys():
                score += scores[word]
                cont += 1.0

        if cont > 0:
            # Step 2. Location and score are printed.
            try:
                if jline["place"]["country_code"] == 'MX':
                    text = jline["text"]
                    x, y = jline["geo"]["coordinates"]
                    name = jline["user"]["screen_name"]
                    userId = "id" + jline["user"]["id_str"]
                    print userId, ", ", y, ", ", x, ",", score
            except:
                pass
Пример #4
0
def main():
    """Build a Spanish score dictionary from two line-aligned files.

    Command-line arguments:
        sys.argv[1]: path of the English file, one ``term<TAB>integer score``
            entry per line.
        sys.argv[2]: path of the Spanish file, read in lock-step with the
            English one (line i of one corresponds to line i of the other).
            NOTE(review): presumably a translation of the English file in
            which each line reads ``word score`` -- confirm.

    A Spanish line is used only if, once stripped, it consists of exactly two
    space-separated tokens.  Its first token, passed through
    ``strip_accents``, becomes the key; the value is the score of the
    matching English line (the second Spanish token is ignored).  A key that
    occurs again overwrites the earlier value.

    Output (stdout): one ``word<TAB>score`` line per collected entry.

    Raises:
        IndexError: if either command-line argument is missing.
        IOError/OSError: if a file cannot be opened.
        StopIteration: if the English file has fewer lines than the Spanish
            file.
        ValueError: if an English line does not contain exactly one tab, or
            the score of a used line is not an integer.
    """
    # Both handles are closed on exit, including when parsing fails midway.
    with open(sys.argv[1]) as english_file, open(sys.argv[2]) as spanish_file:
        scores = {}

        for line in spanish_file:
            # Consume the matching English line on every iteration, even if
            # the Spanish line ends up unused, to keep the files aligned.
            _term, score = next(english_file).split("\t")

            # For simplicity, avoid phrases and use only words.
            tokens = line.strip().split(" ")
            if len(tokens) == 2:
                scores[strip_accents(tokens[0])] = int(score)

    # A parenthesised single expression prints the same under the Python 2
    # print statement and the Python 3 print function.
    for key in scores:
        print(key + "\t" + str(scores[key]))
Пример #5
0
def main():
	"""Derive a Spanish score dictionary from two line-aligned files.

	Command-line arguments:
		sys.argv[1]: path of the English file, one
			``term<TAB>integer score`` entry per line.
		sys.argv[2]: path of the Spanish file, read in lock-step with the
			English one (line i of one corresponds to line i of the other).
			NOTE(review): presumably a translation of the English file in
			which each line reads ``word score`` -- confirm.

	A Spanish line is used only if, once stripped, it consists of exactly
	two space-separated tokens.  Its first token, passed through
	``strip_accents``, becomes the key; the value is the score of the
	matching English line (the second Spanish token is ignored).  A key
	that occurs again overwrites the earlier value.

	Output (stdout): one ``word<TAB>score`` line per collected entry.

	Raises:
		IndexError: if either command-line argument is missing.
		IOError/OSError: if a file cannot be opened.
		StopIteration: if the English file has fewer lines than the Spanish
			file.
		ValueError: if an English line does not contain exactly one tab, or
			the score of a used line is not an integer.
	"""
	# Both handles are closed on exit, including when parsing fails midway.
	with open(sys.argv[1]) as english_file, open(sys.argv[2]) as spanish_file:
		scores = {}

		for line in spanish_file:
			# Consume the matching English line on every iteration, even if
			# the Spanish line ends up unused, to keep the files aligned.
			_term, score = next(english_file).split("\t")

			# For simplicity, avoid phrases and use only words.
			tokens = line.strip().split(" ")
			if len(tokens) == 2:
				scores[strip_accents(tokens[0])] = int(score)

	# A parenthesised single expression prints the same under the Python 2
	# print statement and the Python 3 print function.
	for key in scores:
		print(key + "\t" + str(scores[key]))