forked from maksim2042/DC_DataScience_Meetup
-
Notifications
You must be signed in to change notification settings - Fork 0
/
linguist.py
60 lines (46 loc) · 1.43 KB
/
linguist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
# encoding: utf-8
"""
cunning_linguist.py
Created by Maksim Tsvetovat on 2011-12-08.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""
import sys
import os
from english_stoplist import stoplist
import collections as c
import stemming.porter2 as stem
from itertools import islice
import string
import guess_language as gl
# Identity translation table (maps every byte to itself). It is passed as the
# first argument of the two-argument ``translate`` call, so that only the
# characters listed for deletion are affected.
try:
    table = string.maketrans("", "")
except AttributeError:
    # Python 3 removed string.maketrans; bytes.maketrans builds the same
    # 256-byte identity table, so importing this module no longer fails there.
    table = bytes.maketrans(b"", b"")
"""A couple of util functions for dealing with large dicts"""
def find_key(dic, val):
    """Return the first key of ``dic`` whose value equals ``val``.

    "First" means first in the dictionary's own iteration order.

    Args:
        dic: the dictionary to search.
        val: the value to look for (compared with ``==``).

    Returns:
        A key ``k`` such that ``dic[k] == val``.

    Raises:
        IndexError: if no entry of ``dic`` has the value ``val``. This is the
            exception the earlier list-indexing implementation raised, so
            existing callers keep working.
    """
    # Use the lazy Python 2 iterator when it exists; fall back to items(),
    # which is the only spelling available on Python 3.
    iter_items = getattr(dic, "iteritems", dic.items)
    for key, value in iter_items():
        if value == val:
            # Stop at the first hit instead of collecting every match.
            return key
    raise IndexError("no key found for value %r" % (val,))
def find_value(dic, key):
    """Look up ``key`` in ``dic`` and hand back whatever is stored under it.

    This is a plain subscript access: a missing key raises whatever the
    container's ``[]`` operator raises (``KeyError`` for a ``dict``).
    """
    value = dic[key]
    return value
"""-------------------------------------------------------"""
# Maps the code point of every ASCII punctuation character to None, which is
# how the one-argument (unicode) form of ``translate`` expresses deletion.
_PUNCTUATION_DELETE_MAP = dict.fromkeys(map(ord, string.punctuation))


def strip_punctuation(s):
    """Strip punctuation from a string.

    Every character listed in ``string.punctuation`` is removed; all other
    characters are kept in their original order.

    Args:
        s: the text to clean. Python 2 byte strings, Python 2 ``unicode``
            and Python 3 ``str`` are all handled.

    Returns:
        A copy of ``s`` without ASCII punctuation characters.

    Raises:
        TypeError: if ``s`` has a ``translate`` method that accepts neither
            calling convention (for example Python 3 ``bytes``).
    """
    try:
        # Python 2 byte strings: with a table of None only the deletion
        # characters are applied.
        return s.translate(None, string.punctuation)
    except TypeError:
        # Unicode text only accepts a single mapping argument; the
        # two-argument call above raises TypeError for it.
        return s.translate(_PUNCTUATION_DELETE_MAP)
def process(text):
    """Turn a piece of text into a list of stemmed English tokens.

    The language of ``text`` is guessed first. Text that is not recognised
    as English, or whose language cannot be guessed at all, produces an
    empty list. Otherwise the text is lower-cased and split on whitespace,
    then each word is filtered and cleaned as follows:

    * words starting with ``@`` (mentions) are dropped;
    * punctuation is stripped from the remaining words;
    * words that are empty after stripping, or that appear in the stop
      list, are dropped;
    * the surviving words are stemmed with the Porter2 stemmer.

    Args:
        text: the raw text (e.g. a tweet) to tokenize.

    Returns:
        A list of stemmed tokens, possibly empty.
    """
    try:
        lang = gl.guessLanguageName(text)
    except Exception:
        # Best effort: text whose language cannot be guessed is skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return []

    # only keep the English tweets
    if lang != 'English':
        return []

    tokens = []
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings.
    for raw in text.lower().split():
        # Test for mentions BEFORE stripping punctuation: '@' is itself a
        # punctuation character and would be gone afterwards.
        if raw.startswith('@'):
            continue
        try:
            token = strip_punctuation(raw)
        except TypeError:
            # strip_punctuation may reject some string types; keep the word
            # unstripped rather than losing it.
            token = raw
        # A word made only of punctuation becomes '' and carries no meaning.
        if token and token not in stoplist:
            tokens.append(stem.stem(token))
    return tokens