Example #1
def process_semeval(data_folder, outfname, idfname):
    """
    Process dataset of SemEval 2013 task 2 subtask B
    Only keep positive and negative tweets
    Format: tweet|label,user_id,split\n
    """
    logger.info("start processing tweets for SemEval 2013 task 2 subtask B")
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:  
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]

    train_file, val_file, test_file = data_folder[0], data_folder[1], data_folder[2]
    fout = open(outfname, "w")
    with open(train_file, "rb") as f:
        for line in f:  
            parts = line.strip().split("\t")
            label = parts[2]
            tweet = parts[3]
            userid = "unknown"
            if parts[0] in tid_uid_map: userid = tid_uid_map[parts[0]]
            if tweet == "Not Available": continue
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            if label == "positive":
                fout.write(tweet + "|1," + userid + ",1\n")
            elif label == "negative":
                fout.write(tweet + "|0," + userid + ",1\n")
    with open(val_file, "rb") as f:
        for line in f:  
            parts = line.strip().split("\t")
            label = parts[2]
            tweet = parts[3]
            userid = "unknown"
            if parts[0] in tid_uid_map: userid = tid_uid_map[parts[0]]
            if tweet == "Not Available": continue
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            if label == "positive":
                fout.write(tweet + "|1," + userid + ",2\n")
            elif label == "negative":
                fout.write(tweet + "|0," + userid + ",2\n")
    with open(test_file, "rb") as f:
        for line in f:  
            parts = line.strip().split("\t")
            label = parts[2]
            tweet = parts[3]
            userid = "unknown"
            if parts[0] in tid_uid_map: userid = tid_uid_map[parts[0]]
            if tweet == "Not Available": continue
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            if label == "positive":
                fout.write(tweet + "|1," + userid + ",3\n")
            elif label == "negative":
                fout.write(tweet + "|0," + userid + ",3\n")
    fout.close()
    logger.info("finish processing data")
Example #2
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = twokenize.tokenize(string.lower())
    for i in range(len(string)):
        if string[i][0] == '@':
            string[i] = ""
        elif string[i][0:4] =="http":
            string[i]=""
    string = " ".join(string)


    """
    reducing repeated char
    """
    a=string[0]    
    b=string[1]
    newStr=a+b
    
    for i in range(len(string)-2):
        c=string[i+2]
        if(a==b and b==c):
            pass
        else:
            newStr=newStr+c
        a=b
        b=c
    string = newStr
    
    string = string.replace("`","\'")
    string = string.replace("\u002c",",")
    string = string.replace("\u2019","\'")
    string = string.replace("\\\"\"","\"")
    
    '''
    string = re.sub(r"[^A-Za-z0-9(),!?]", " ", string)     
    
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!+", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)
    '''
    
    string = twokenize.tokenize(string)
    return " ".join(string)
Example #3
def process_omd(fname, outfname, idfname):
    """
    Process OMD dataset: each tweet is associated with three votes, where 1 = negative and 2 = positive
    Only keep positive and negative tweets according to 2/3 agreement
    Format: tweet|label,user_id\n
    """
    logger.info("start processing tweets for OMD")
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:  
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]

    fout = open(outfname, "w")
    with open(fname, "rb") as f:
        for line in f:  
            parts = line.strip().split("\t")
            votes = [int(parts[-1]), int(parts[-2]), int(parts[-3])]
            votes.sort()
            if votes[0] == 1 and votes[1] == 1: label = "0"
            elif votes[1] == 2 and (votes[0] == 2 or votes[2] == 2): label = "1"
            else: continue
            tweet = parts[2]
            userid = "unknown"
            if parts[0] in tid_uid_map: userid = tid_uid_map[parts[0]]
            if tweet[0] == "\"": tweet = tweet[1:].strip()
            if tweet[-1] == "\"": tweet = tweet[:-1].strip()
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            fout.write(tweet + "|" + label +  "," + userid + "\n")
    fout.close()
    logger.info("finish processing data")
Example #4
def perprocessing(tdic):
    new_dic = {}
    POS_feature = []
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        # print(text_tk)
        print(text_tk)
        telist = []
        for word in text_tk:
            word = word.lower()
            # ps = nltk.stem.PorterStemmer()
            # word = ps.stem(word)
            telist.append(word)
        # print(telist)
        afterlemma = lemma(telist)
        telist = afterlemma[0]
        POS_feature.append(afterlemma[1])
        # print(telist)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)  #now preprocess . change to URLINK SADFACE
        print(newtext)
        new_dic[id] = gt, newtext
    return new_dic, np.array(POS_feature)
Example #5
def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """ Tokenizes string s, removes stopwords, and returns a set of k-shingles
    """
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
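kshinglize relies on filterstopwords and tokens_to_kshingles, which are not shown in this example. A plausible sketch of both helpers, assuming a k-shingle is simply a tuple of k consecutive tokens:

def filterstopwords(tokens, stopwords):
    # Drop tokens that appear in the stopword collection.
    return [t for t in tokens if t not in stopwords]

def tokens_to_kshingles(tokens, k):
    # Every window of k consecutive tokens, as hashable tuples collected in a set.
    return {tuple(tokens[i:i + k]) for i in range(len(tokens) - k + 1)}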
Example #6
 def __init__(self, testData):
     self.labeledTweets = []
     for line in open(testData):
         line = line.rstrip('\n')
         fields = line.split('\t')
         fields[6] = ' '.join(twokenize.tokenize(fields[6]))
         self.labeledTweets.append(fields)
Example #7
def prepare_and_tokenize(text,
                         url_scheme='st',
                         strip_word_padding=False,
                         alphanumeric_only=False):
    """ 
        Prepares the raw tweet text for tokenisation, then tokenizes it using twokenize (https://github.com/ianozsvald/ark-tweet-nlp-python)
    """
    #handle URLS
    # possible schemes 'st': single token, 'leave'
    if url_scheme != 'leave':
        text = re.sub(url_re, "hyperlinktoken", text)
    text = re.sub(mention_re, "mentiontoken", text)

    # reduce extended words to a shorter token. e.g. "reeeeeeeeeee"->"ree", "hahahaha" -> "haha"
    if strip_word_padding:
        #TODO: Make this actually work
        text = re.sub(word_pad_re, "", text)

    # strip out non AN characters (except # and @)
    if alphanumeric_only:
        text = re.sub(alphanumeric_only_re, "", text)

    # add spaces between emoji
    for match in list(set(emoji_re.findall(text))):
        text = text.replace(match, " " + match + " ")

    return tokenize(text)
Example #8
def preprocess(m, sep_emoji=False):
    m = m.lower()    
    m = max_reps(m)
    #replace user mentions with token '@user'
    user_regex = r".?@.+?( |$)|<@mention>"    
    m = re.sub(user_regex," @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url," url ", m, flags=re.I)        
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        #tokenize emoji; this tokenizer, however, over-splits repeated punctuation, e.g. "blah blah!!!" -> ['blah','blah','!','!','!'] instead of ['blah','blah','!!!']
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)         
        if len(n_toks)!=len(m_toks):
            #check if there is any punctuation in this string
            has_punct = map(lambda x:x in twk.punctuation, n_toks)
            if any(has_punct):  
                new_m = n_toks[0]
                for i in xrange(1,len(n_toks)):
                    #while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i-1] and (n_toks[i] == n_toks[i-1]):
                        new_m += n_toks[i]
                    else:
                        #otherwise add space
                        new_m += " "+n_toks[i]                   
                tokenized_msg = new_m                
    return tokenized_msg.lstrip()
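The sep_emoji branch above indexes into the result of map(), which only works on Python 2 (the same snippet reappears as Example #12). A version-independent sketch of just the repeated-punctuation merge, assuming punctuation is an iterable of single-character punctuation tokens such as twk.punctuation appears to be:

def merge_repeated_punct(toks, punctuation):
    # Re-join runs of identical punctuation tokens that the emoji tokenizer
    # over-split, e.g. ['blah', '!', '!', '!'] -> ['blah', '!!!'].
    merged = []
    for tok in toks:
        if merged and tok in punctuation and set(merged[-1]) == {tok}:
            merged[-1] += tok
        else:
            merged.append(tok)
    return merged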
Example #9
def main(argv):

    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        exit()

    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    infile = open(infile_name, 'r')
    outfile = open(outfile_name, 'w')

    tagger = PerceptronTagger()

    print("Reading file...")
    line = infile.readline()

    while line != '':
        # Use Twokenizer for twitter parser
        tagset = None
        tokens = tokenize(line)
        tags = nltk.tag._pos_tag(tokens, tagset, tagger)
        outfile.write(format_tagged(tags))
        line = infile.readline()

    # close file and connection
    infile.close()
    outfile.close()
    print("Finished tagging... Closing files.")
Example #10
    def Extract(self, text):
        features = []
        words = twokenize.tokenize(text)

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for i in range(len(words)):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])

            if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                nCapitalized += 1.0

            if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                    nCapUpperViolated += 1.0
                    features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
                elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                    #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
                if re.match(r'\w+', words[i][0:1]):
                    nWords += 1
            if re.match(r"i|i'm|im|u", words[i]):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", words[i]):
                nAllCaps += 1
                
        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        return ' '.join(["%s:1" % x for x in features]) + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps/nWords) + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized/nWords) + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated/nWords) + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated/nWords)
Example #11
    def take_into_account_negation(self, tweet):
        neg_pattern = re.compile(
            'never|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|no|'
            +
            'n\'t|haven\'t|haven\'t|hasn\'t|hadn\'t|can\'t|couldn\'t|shouldn\'t|won\'t|wouldn\'t|don\'t|doesn\'t|didn\'t|isn\'t|aren\'t',
            re.IGNORECASE)
        clause_pattern = re.compile(r'^[.:;!?]$')

        neg = re.search(neg_pattern, tweet)
        if neg != None:
            #print 'Negation in tweet: ' + tweet
            pattern = tweet[neg.start():]
            end = re.search(clause_pattern, pattern)
            if end == None:
                end_str = len(tweet)
            else:
                end_str = end.start()
                end_str = int(end_str) - 1
            negated = ''

            tokens = twokenize.tokenize(pattern[:end_str])
            for w in tokens:
                negated += w + '_neg '
            negated = tweet[:neg.start()] + negated
            #print 'Negation in tweet: ' + negated
        else:
            negated = tweet
        return negated
Example #12
def preprocess(m, sep_emoji=False):
    m = m.lower()
    m = max_reps(m)
    #replace user mentions with token '@user'
    user_regex = r".?@.+?( |$)|<@mention>"
    m = re.sub(user_regex, " @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url, " url ", m, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        #tokenize emoji; this tokenizer, however, over-splits repeated punctuation, e.g. "blah blah!!!" -> ['blah','blah','!','!','!'] instead of ['blah','blah','!!!']
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            #check if there is any punctuation in this string
            has_punct = map(lambda x: x in twk.punctuation, n_toks)
            if any(has_punct):
                new_m = n_toks[0]
                for i in xrange(1, len(n_toks)):
                    #while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i - 1] and (
                            n_toks[i] == n_toks[i - 1]):
                        new_m += n_toks[i]
                    else:
                        #otherwise add space
                        new_m += " " + n_toks[i]
                tokenized_msg = new_m
    return tokenized_msg.lstrip()
Example #13
    def build_dict(self, corpus, word=True, which_grams=None):
        """
        Builds the necessary ngrams out of the corpus
        Word is set to True by default which builds word ngrams
        If set to False, will build character ngrams
        Which_grams is the n values of the ngrams. By default will
        create unigrams, bigrams, and trigrams for words and
        bigrams, trigrams, four-grams and five-grams for characters
        """
        dct = UnigramDictionary()
        if word:
            which_grams = [1,2,3]
        else:
            which_grams = [2,3,4,5]

        for text in corpus:
            if word:
                tokens = twokenize.tokenize(text)
            else:
                tokens = list(text)
            
            # list of tokens; each index holds the zipped n-gram iterator
            # for the corresponding n
            all_tokens = [ find_ngrams(tokens, n) for n in which_grams ]

            for j in all_tokens:
                for token in j:
                    dct.add(token)

        if word:
            self.word_ngram = dct
        else:
            self.char_ngram = dct

        return dct
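build_dict assumes a find_ngrams helper (and a UnigramDictionary class) defined elsewhere in the project. The conventional zip-based implementation find_ngrams appears to expect, as a sketch:

def find_ngrams(tokens, n):
    # Yield n-gram tuples, e.g. n=2 over ['a', 'b', 'c'] -> ('a', 'b'), ('b', 'c').
    return zip(*[tokens[i:] for i in range(n)])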
Example #14
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        for word in text_tk:
            word = word.lower()
            ps = nltk.stem.PorterStemmer()
            word = ps.stem(word)
            # word = nltk.stem.SnowballStemmer(word)
            telist.append(word)
        # 	return ''.join(ans)
        # newtext = ?telist
        # newtext = ' '.join(text_tk)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
        # print(type(tdic[line][1]))
        # print(line)
        # print(type(line))
        # print(type(newtext))
        # print(newtext)
    return new_dic
Example #15
def main(argv):

    tagger = PerceptronTagger()
    tagset = None
    tokens = tokenize(line)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    format_tagged(tags)
Example #16
File: search.py Project: zaycev/n7
 def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
     reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
     term_freq = Counter()
     term_id_map = dict()
     tweet_vectors = []
     for row in reader:
         tweet_id = int(row[0])
         tweet_text = row[-1]
         terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
         if learn_lemmas:
             terms = [self.lmtz.lemmatize(term) for term in terms]
         tweet_sp_vector = []
         counted_ids = []
         for term in terms:
             if term not in term_id_map:
                 term_id = len(term_id_map)
                 term_id_map[term] = term_id
             else:
                 term_id = term_id_map[term]
             if term_id not in counted_ids:
                 term_freq[term_id] += 1
                 counted_ids.append(term_id)
             tweet_sp_vector.append(term_id)
         tweet_vectors.append((tweet_id, tweet_sp_vector))
         if len(tweet_vectors) >= cache_size:
             self.write_tweet_vectors(tweet_vectors)
             tweet_vectors = []
     self.write_tweet_vectors(tweet_vectors)
     self.write_terms(term_id_map, term_freq)
Example #17
    def Extract(self, text):
        features = []
        words = twokenize.tokenize(text)

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for i in range(len(words)):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])

            if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                nCapitalized += 1.0

            if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                    nCapUpperViolated += 1.0
                    features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
                elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                    #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
                if re.match(r'\w+', words[i][0:1]):
                    nWords += 1
            if re.match(r"i|i'm|im|u", words[i]):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", words[i]):
                nAllCaps += 1
                
        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        return ' '.join(["%s:1" % x for x in features]) + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps/nWords) + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized/nWords) + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated/nWords) + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated/nWords)
Example #18
 def __init__(self, testData):
     self.labeledTweets = []
     for line in open(testData):
         line = line.rstrip('\n')
         fields = line.split('\t')
         fields[6] = ' '.join(twokenize.tokenize(fields[6]))
         self.labeledTweets.append(fields)
Example #19
File: decoder.py Project: BBN-E/Hume
def get_sentences(line, content_type):
    global spacy_en

    if content_type == 'SocialMediaPosting':
        sentences = []
        start_offset = 0
        sent = []

        for token in twokenize.tokenize(line[:-1]):
            idx = line.index(token, start_offset)
            sent.append(Token(token, idx))
            start_offset = idx + len(token)

        sentences.append(sent)
        return sentences

    elif content_type == 'Blog' or content_type == 'NewsArticle' or content_type == 'Post':
        try:
            spacy_doc = spacy_en(line)
        except:
            spacy_en = spacy.load('en')
            print('**** Loaded spacy en')
            spacy_doc = spacy_en(line)

        return spacy_doc.sents
Example #20
    def extract_more_decep_tech_features(self, tweets, vocab_file):
        #print 'Extracting decep_tech/decep_type features with training vocab'
        train_vocab = {}
        k = 0
        for line in open(vocab_file):
            train_vocab[line.strip()] = k
            k += 1
    #print 'Train vocab size=>' + str(len(train_vocab))

        cv = CountVectorizer(ngram_range=(1, 1),
                             binary=True,
                             vocabulary=train_vocab)
        train_features_bow = cv.fit_transform(tweets)

        add_decep_tech_matrix = []
        hash_pattern = re.compile('\#+[\w_]+[\w\'_\-]*[\w_]+')
        elong_pattern = re.compile("([a-zA-Z])\\1{2,}")
        caps_pattern = re.compile(('[A-Z][A-Z\d]+'))
        punc_pattern = re.compile('([.,!?]+)')

        for tweet in tweets:
            tweet_vector = []
            tokens = twokenize.tokenize(tweet)
            #count the number of elongated tokens
            n_elong = len(re.findall(elong_pattern, tweet))

            #count the number of all_caps tokens
            n_caps = len(re.findall(caps_pattern, tweet))

            #count the number of repeated punctuation
            n_rep_punct = len(re.findall(punc_pattern, tweet))

            #count the number of hashtags
            n_hahtag = len(re.findall(hash_pattern, tweet))

            #check if the tweet has a SAD, HAPPY, BOTH_HS or NA emoticon
            emoticon_mood = emoticons.analyze_tweet(tweet.strip())
            if emoticon_mood == 'NA':
                emoticon_mood = 0
            elif emoticon_mood == 'HAPPY':
                emoticon_mood = 2
            elif emoticon_mood == 'SAD':
                emoticon_mood = 1
            elif emoticon_mood == 'BOTH_HS':
                emoticon_mood = 4
            tweet_vector = [
                n_elong, n_caps, n_rep_punct, n_hahtag, emoticon_mood
            ]
            add_decep_tech_matrix.append(tweet_vector)

    #print np.asarray(add_decep_tech_matrix).shape
        a = np.asarray(add_decep_tech_matrix)
        #print 'additional 5 features: ' + str(a)

        sa = sparse.csr_matrix(add_decep_tech_matrix)
        features = hstack([sa, train_features_bow])
        #print 'final feature matrix size: ' + str(features.shape)

        return features
Example #21
	def process(self,text):
		
		tTweet = ""
		for word in text.split():
			if "#" in word:
				word = word.replace("#"," ")
				f=0
				for tt in self.remove:
					if tt in word:
						f=1
				if f==1:
					continue
			tTweet = " ".join([tTweet,word])
			tTweet = tTweet.strip()

		tempTweet = ""
		for word in twokenize.tokenize(tTweet):
			if word != " " and word not in self.stop and not word.isdigit():
				word = word.strip().lower()
				if len(word) > 26:
					word=word[:27]
				#### Normalize Emoticons
				try:
					word = self.emoticons[word]
				except:
					#Normalize Acronyms
					try:
						try:
							if  self.wordDict[word] ==1:
								word = word
						except:
							word = self.acronyms[word]
					except:
					#Normalize Contractions
						try:
							word = self.contractions[word]
						except:
							#Normalize words (Spell)
							try:
								if self.wordDict[word] == 1:
									word =	word
							except:
								CW = self.correct(word)
								if "@" in word or "#" in word:
									word = word
								else:
									if CW != "a":
										word = CW
				if "@" in word:
					word="@user"
				tempTweet = " ".join([tempTweet,word.strip()])
				tempTweet = tempTweet.lower().strip()
		tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
		#print(tempTweet.encode("utf-8"))
		return(tempTweet)

##Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
Example #22
def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Example #23
def splitTweet2Sents(tweet):
    tweet = re.sub(u'\u201c', '\"', tweet)
    tweet = re.sub(u'\u201d', '\"', tweet)
    tweet = tweet.encode('ascii', 'ignore')
    tokenizedtweet = u" ".join(tokenize(tweet))
    cleantweet = filterTweetText(tokenizedtweet)
    sents = sentSplitter(cleantweet)
    return sents
Example #24
def tokenize(text):
    stemmer = PorterStemmer()
    # lmtzr = WordNetLemmatizer()
    tokens = twokenize.tokenize(text)
    tokens_clean = [s for s in tokens if s not in set(string.punctuation)]
    # tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens_clean, stemmer)
    return stems
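stem_tokens is not shown in this example; it is presumably the usual one-liner over NLTK's PorterStemmer. A sketch under that assumption:

def stem_tokens(tokens, stemmer):
    # Apply the stemmer to every token, e.g. "running" -> "run".
    return [stemmer.stem(t) for t in tokens]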
Example #25
def preprocess(m):
    m = m.lower()
    m = max_reps(m)
    # replace user mentions with token '@user'
    user_regex = r".?@.+?( |$)|<@mention>"
    m = re.sub(user_regex, " @user ", m, flags=re.I)
    # replace urls with token 'url'
    m = re.sub(twokenize.url, " url ", m, flags=re.I)
    return twokenize.tokenize(m)
Example #26
File: pmi.py Project: zaycev/n7
def all_tokens(tweetreader):
    i = 0
    for r in tweetreader:
        i += 1
        tokens = tokenize(r[-1])
        for t in tokens:
            yield t
        if i >= 50000:
            return
Example #27
def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Example #28
def run_all():

    csv.field_size_limit(sys.maxsize)

    psql_conn = psycopg2.connect("dbname='tweet'")
    psycopg2.extras.register_hstore(psql_conn)
    pg_cur = psql_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    
    # Build up the bins-to-nghds mapping so we can easily translate.
    bins_to_nghds = {}
    for line in DictReader(open('point_map.csv')):
        bins_to_nghds[(float(line['lat']), float(line['lon']))] = line['nghd']
    #get nghd to zone mapping
    nghds_to_zones = {}
    for line in DictReader(open('zone_map.csv')):
        nghds_to_zones[line['nghd']] = line['zone']

    words_per_zone = json.load(open('outputs/zone_words.json'))
    top10words = {}
    tweets_per_word = defaultdict(lambda: defaultdict(list))
   
    for zone in words_per_zone:
        top10words[zone] = words_per_zone[zone]["top words"]

    pg_cur.execute("SELECT text, ST_ASGEOJSON(coordinates), user_screen_name " + 
                                                        "FROM tweet_pgh;")
    counter = 0
    for row in pg_cur:
        counter += 1
        if (counter % 10000) == 0:
            print str(counter) + ' tweets processed'
        coords = json.loads(row[1])['coordinates']
        bin = util.util.round_latlon(coords[1], coords[0])
        if bin in bins_to_nghds:
            tweet_nghd = bins_to_nghds[bin]
        else:
            tweet_nghd = 'Outside Pittsburgh'
        if tweet_nghd in nghds_to_zones:
            zone = "Zone " + nghds_to_zones[tweet_nghd]
        else:
            zone = tweet_nghd
        tweet = row[0]
        tweet = tweet.replace('“','"').replace('”','"')
        tweet = unicode(tweet, errors='ignore')
        username = row[2]
        wordList = twokenize.tokenize(tweet)
        wordList = map(lambda x:x.lower(),wordList) 
        for word in top10words[zone]:
            if word in wordList:
                tweets_per_word[zone][word].append(username + ": " + tweet)
   
    print "writing to JSON file"

    with open('outputs/tweets_per_zoneword.json','w') as outfile:
        json.dump(tweets_per_word,outfile, indent=2)
Example #29
def tokenize_str(istring):
    ostring = []
    for line in istring.split('\n'):
        try:
            ostring.append(u" ".join(twokenize.tokenize(
                line[:])).encode('utf-8'))
        except Exception as e:
            print e
            print line

    return '\n'.join(ostring)
Example #30
 def process_statuses(self, statuses):
     statuses = [twokenize.tokenize(s.text.lower()) for s in statuses]
     for s in xrange(len(statuses)):
         w = 1
         while True:
             if w >= len(statuses[s]):
                 break
             if statuses[s][w][0] == "'":
                 statuses[s] = statuses[s][:w-1] + [statuses[s][w-1] + statuses[s][w]] + statuses[s][w+1:]
                 w = 0
             w += 1
     return statuses
Example #31
def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
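The docstring above promises zero-padding, but the k argument is never used; in the CNN-style preprocessing this snippet resembles, padding is usually applied as a separate step. A hypothetical helper illustrating that step:

def pad_to_length(x, max_len, pad_idx=0):
    # Right-pad (or truncate) the index list to a fixed length.
    return (x + [pad_idx] * max_len)[:max_len]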
Example #32
def test(text):
    newtext = text.lower()
    newtext = replaceURLs(newtext)
    newtext = replaceUserMentions(newtext)
    newtext = replacenonalpha(newtext)
    newtext = replacenumbers(newtext)
    newtext = replaceshort(newtext)
    newtext = replacespace(newtext)
    newtext = newtext.strip()
    newtext = twokenize.tokenize(newtext)
    
    return newtext
Example #33
    def vectorize(self, text, embeddings=True, ngrams=True):
        """
        Returns the feature vector for a given text
        """
        word_tokens = twokenize.tokenize(text)
        char_tokens = list(text)

        #Don't do anything if the necessary data isnt there
        #if self.char_ngram is None:
        #    print("Missing character n-grams")
        #    return
        #if self.word_ngram is None:
        #    print("Missing word n-grams")
        #    return

        if ngrams:         
            word_features = find_ngram_ft_vec(word_tokens, self.word_ngram)
            char_features = find_ngram_ft_vec(char_tokens, self.char_ngram, which_grams = [2,3,4,5])

        if embeddings:
            local_w_vects = [ self.word_vectors[w] for w in word_tokens if w in self.word_vectors]
            if local_w_vects == []:
                word_embding = self.avg_embd
            else:
                word_embding = csr_matrix(np.mean(local_w_vects, axis=0))

        # total_vector = None
        # count = 0
        # for w in word_tokens:
        #     if w in self.word_vectors:
        #         count += 1
        #         if total_vector is None:
        #             total_vector = self.word_vectors[w]
        #         else:
        #             total_vector += self.word_vectors[w]

        # word_embding = coo_matrix(np.divide(total_vector, count))

        if embeddings and ngrams:
            feature_vect = hstack((word_features, char_features))
            feature_vect = hstack((feature_vect, word_embding))

        elif embeddings:
            feature_vect = word_embding

        elif ngrams:
            feature_vect = hstack((word_features, char_features))

        else:
            print("Do you not want anything?")
        #feature_vect = hstack((feature_vect, word_embding))

        return feature_vect
Example #34
def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
Example #35
def tokenize(istring, ostring):
    # print 'this is mytokenizer.py
    ifile = open(istring, 'r')
    ofile = open(ostring, 'w')
    for line in ifile:
        try:
            ofile.write(
                u" ".join(twokenize.tokenize(line[:])).encode('utf-8') + '\n')
        except:
            print line
    ofile.close()
    ifile.close()
Example #36
def preprocess(tweet):
    abbv_dict = json.load(open("../other/abbreviations.json"))
    emo_lexica_dict = json.load(open("../other/emoticons.json"))
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
Example #37
def tokenize_and_clean(msg, alignments):
  if alignments: 
    toks = twokenize.tokenize(msg)
  else:          
    toks = twokenize.simple_tokenize(msg)
  for i in range(len(toks)):
    toks[i] = toks[i].lower()
  inds = range(len(toks))
  #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
  if alignments: 
    return toks.subset(inds)
  else:
    return [toks[i] for i in inds]
Example #38
def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            tokens = map(lambda tk: re.sub(NUM_PATTERN, 'NUM', tk), tokens)
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean
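normalize_tweet expects module-level URL_PATTERN and NUM_PATTERN regexes that are not part of this example; the following definitions are plausible placeholders only:

import re

URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")  # assumed: any http(s)/www link
NUM_PATTERN = re.compile(r"\d+")                    # assumed: any run of digits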
Example #39
def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
Example #40
def preprocess(tweet):
    abbv_dict = json.load(open("../other/abbreviations.json"))
    emo_lexica_dict = json.load(open("../other/emotions.json"))
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
Example #41
def preproc(text):
    newtext = text.lower()
    newtext = replaceURLs(newtext)
    newtext = replaceUserMentions(newtext)
    newtext = replacenonalpha(newtext)
    newtext = replacenumbers(newtext)
    newtext = replaceshort(newtext)
    newtext = replacespace(newtext)
    newtext = newtext.strip()
    newtext = twokenize.tokenize(newtext)
    newtext = removestop(newtext, clean_stop_words)
    newtext = ' '.join(newtext)
    
    return newtext
Example #42
    def __iter__(self):
        for fname in self.files:
            with gzip.open(fname) as f:
                for line in f:
                    tweet = json.loads(line.strip())

                    text = tweet['text']
                    if self.exclude_rts:
                        if retweet_or_share(text) or tweet['is_rt']:
                            continue
                    text = prepare_text(text)
                    if self.downcase:
                        text = text.lower()
                    yield tokenize(text)
Example #43
def twitterTokenizeText(fn, output_fn):
    with open(fn, 'r') as input_file:
        tok_lines = []
        for line in input_file:
            line = line.strip().lower().decode('utf8')
            line = line.replace('`', ' ')
            tok_seq = tokenize(line)
            tok_line = ' '.join(tok_seq)
            tok_lines.append(tok_line)

    with open(output_fn, 'w') as output_file:
        tok_text = '\n'.join(tok_lines)
        print(tok_text.encode('utf8'), file=output_file)
    print('done twitter tokenizing text....')
Example #44
def tokenizeTweets(tweets):
    total_tweets = []
    filter_prefix_set = ('@', 'http', 'www')
    # filter for english
    for status in tweets:
        tokenized = tokenize(status)
        # remove http tags and hashtags
        words = [
            re.sub(r'[^\w\s]', '', word).lower() for word in tokenized
            if not word.startswith(filter_prefix_set)
        ]
        if words:
            total_tweets.append(words)

    return total_tweets
Example #45
def process_line(s, clean_string=True, enable_tags = False):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
            s = clean_str(s)
    tokens = tokenize(s)
    if enable_tags:
        sent = nltk.pos_tag(tokens)
        chunks = nltk.ne_chunk(sent, binary=False)
        words = []
        for chunk in chunks:
            words += process_chunk(chunk)
        return [w.lower().encode('UTF-8') for w in words]
    else:
        return [process_token(token).lower().encode('UTF-8') for token in tokens]
Example #46
def parse_tweets(tweets):
    parsed_tweets =[]
    for tweet_json in tweets:
	try:
	    #tweet_json = json.loads(tweet_str);
	    tweet_text = tweet_json['text'];
	    if u'RT' in tweet_text:	
		tweet_text = tweet_text[0:tweet_text.index(u'RT') -1]
		
	    tweet_token = tk.tokenize(tweet_text)
	    tweet_token =[char_reduction(tok) for tok in tweet_token]
	    tweet_token = [t for tok in tweet_token for t in es.expand(tok) if (not (('@' in t) or (tk.Url_RE.search(t)) or (not emo.Emoticon_RE.search(t) and tk.Punct_re.search(t))))]
	    
	    if tweet_token != []:
		tweet_obj = {"token":tweet_token,"location" : tweet_json['place']['country'] if tweet_json['place'] != None else None,"json":tweet_json,"type" :""}
		parsed_tweets.append(tweet_obj)
	except Exception as e:
	    print e
	    
    return parsed_tweets
Example #47
    def read_tweets(self, filename, emo):
        """Read tweets in raw format, returning a list of all tweets in the file"""
        emo_tweets = []
        non_emo_tweets = []
        with codecs.open(filename, encoding='utf8') as tweet_file:
#            tweet = []
            for line in tweet_file:
                data = json.loads(line)
                id = data['tweetid'].strip()
                text = data['text'].strip()
                emotions = data['emotions']
                tokens = tokenize(text)
                incount = 0
                for e in emotions:
                    if e == emo:
                        incount = 1
                if incount == 1:
                    emo_tweets.append(SPACE.join(tokens))
                elif incount == 0:
                    non_emo_tweets.append(SPACE.join(tokens))    
        return emo_tweets, non_emo_tweets
Example #48
    def __init__(self, line):
        fields = line.split('","')
        if fields[0] == '"0':
            self.senti = -1
        elif fields[0] == '"2':
            self.senti = 0
        elif fields[0] == '"4':
            self.senti = 1
        self.id = fields[1]
        self.date = fields[2]
        # self.text = fields[5][1:-1]
        self.text = normalization(fields[5][:-1])
        tokens = tokenize(self.text)
        self.tokens = tokens
        tokens_postag = nltk.pos_tag(tokens)
        wordnet_tag = []
        for each_pair in tokens_postag:
            if 'NN' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'n'))
            if 'JJ' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'a'))
            elif 'RB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'r'))
            elif 'VB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'v'))

        # lemmatized tokens are lemmatized and lowered
        self.ltoken_tag = []
        for each_pair in wordnet_tag:
            lword = lemmatizer.lemmatize(each_pair[0], each_pair[1])
            self.ltoken_tag.append((lword.lower(), each_pair[1]))

        self.tweet_senti_score = []

        for each_pair in self.ltoken_tag:
            each_score = sentiextractor.get_score(each_pair)
            if abs(each_score) > 0.02:
                self.tweet_senti_score.append(each_score)
            else:
                self.tweet_senti_score.append(0)
Example #49
def analyze_tweets(tweets):
    'keep what is necessary'
    first_candidates = []
    sys.path.append("../libs/tweetmotif/")
    import twokenize

    for tweet in tweets:
        text = tweet['text']
        word_list = twokenize.tokenize(text)
        if '#socrates' in word_list:
            first_candidates.append(tweet)
    
    # filter out tweet not published today.
    import datetime
    second_candidates = []
    for first_cand in first_candidates:
        created_at = first_cand['created_at']
        orthodoxized_time = orthodoxize_time(created_at, "%Y%m%d")
        today = datetime.date.today().strftime("%Y%m%d")
        if orthodoxized_time == today:
            second_candidates.append(first_cand)
    
    return second_candidates
Example #50
def run_zmp(clf, port, preprocess=False, verbose=False):
    '''Classify data coming from a ZMQ socket, reply to each request with the
    result.
    '''
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    address = 'tcp://*:' + str(port)
    socket.bind(address)
    if verbose:
        print('ZMQ Service Running: on %s' % (address,))

    while True:
        #  Wait for next request from client
        message = socket.recv()
        # preprocess
        if preprocess:
            message = twokenize.tokenize(message)
            message = twokenize.preprocess(message)
        # check for empty message
        if not message:
            socket.send(str(default_class))
        # classify and reply
        else:
            socket.send(str(clf.predict([message])[0]))
Example #51
    def __init__(self, text):
        self.text = normalization(text)
        tokens = tokenize(self.text)
        self.tokens = tokens
        tokens_postag = nltk.pos_tag(tokens)
        wordnet_tag = []
        for each_pair in tokens_postag:
            if 'NN' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'n'))
            if 'JJ' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'a'))
            elif 'RB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'r'))
            elif 'VB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'v'))
        self.not_words = set(['not', 'don\'t', 'dont', 'didn\'t', 'didnt', 'doesn\'t',
                              'doesnt', 'no', 'never', 'isn\'t', 'isnt', 'cant', 'can\'t', 'cannot', 'wasnt', 'wasn\'t', 'weren\'t', 'werent', 'couldn\'t'])

        # lemmatized tokens are lemmatized and lowered
        self.ltoken_tag = []
        for each_pair in wordnet_tag:
            lword = lemmatizer.lemmatize(each_pair[0], each_pair[1])
            self.ltoken_tag.append((lword.lower(), each_pair[1]))

        self.tweet_senti_score = []

        for i in xrange(len(self.ltoken_tag)):
            each_score = sentiextractor.get_score(self.ltoken_tag[i])
            # print each_score
            if abs(each_score) > 0.02:
                if i>0 and self.ltoken_tag[i-1][0] in self.not_words or i>1 and self.ltoken_tag[i-2][0] in self.not_words:
                    self.tweet_senti_score.append(-each_score)
                else:
                    self.tweet_senti_score.append(each_score)
            else:
                self.tweet_senti_score.append(0)
Example #52
                entityMap[entity] = i
                i += 1

dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
        (dictionary, label) = line.rstrip('\n').split(' ')
        dict2label[dictionary] = label

nLines = 1
#tweet ='Accident - A4 Great West Rd about 100 M from Syon Lane near Gillette Cnr  - Road has been opened with lane restricted.'+'\n'+'Correction Accident - A4 Great West Rd at Syon Lane (Gillette Cnr) Road was closed w/b, which has been opened with a lane one restriction.' 
#tweet ='Shepherds Bush Green remains closed for gasworks from Holland Park RBT, diversions in place - expect peak period delays (espec. Holland Rd)\nShepherds Bush Green is closed towards King Street College due to roadworks between Holland Rd Roundabout and West 12 Shopping Centre\nR.I.P to the possum on Holland Rd that car crushed you.Kroger making a comeback the one on holland rd gon be nice as hell\n@TeaQ09 @Moe_Diesel_Baby  Whats going on in wagener on new holland rd????'
#tweet='The A4 Ellesmere Rd has reopened at Sutton Court Rd following the earlier collision. Residual Qs remain back to junction 2 on the M4'
tweet =sys.stdin.readline().strip()
line = tweet.encode('utf-8')
while line:
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        goodCap = capClassifier.Classify(words) > 0.9

        if posTagger:
                pos = posTagger.TagSentence(words)
                pos = [p.split(':')[0] for p in pos]  # remove weights   
        else:
                pos = None

        # Chunking the tweet
        if posTagger and chunkTagger:
                word_pos = zip(words, [p.split(':')[0] for p in pos])
                chunk = chunkTagger.TagSentence(word_pos)
Example #53
BASE_DIR = 'twitter_nlp.jar'

sys.path.append('%s/hbc/python' % (BASE_DIR))
sys.path.append('%s/python' % (BASE_DIR))

from LdaFeatures import LdaFeatures
from twokenize import tokenize

prevText = None
for line in sys.stdin:
    line = line.rstrip('\n')
    fields = line.split('\t')

    sid    = fields[0]
    text   = fields[6]
    words  = tokenize(text)
    confidence = 1.0 / float(fields[-1])
    eType  = fields[-2]
    entity = fields[-3]
    neTags = fields[-4].split(' ')
    pos    = fields[-5].split(' ')
    words  = fields[-6].split(' ')

    #Just skip duplicate texts (will come from tweets with more than one entity)
    if prevText and prevText == text:
        continue
    prevText = text

    features = LdaFeatures(words, neTags, windowSize=int(options.windowSize))
    for i in range(len(features.entities)):
        entity =  ' '.join(features.words[features.entities[i][0]:features.entities[i][1]])
Example #54
def parseOneTweet(line):
    words = twokenize.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    if posTagger:
        pos = posTagger.TagSentence(words)
        #pos = [p.split(':')[0] for p in pos]  # remove weights   
        pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights   
    else:
        pos = None

    # Chunking the tweet
    if posTagger and chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
        chunk = chunkTagger.TagSentence(word_pos)
        chunk = [c.split(':')[0] for c in chunk]  # remove weights      
    else:
        chunk = None

    #Event tags
    if posTagger and eventTagger:
        events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
        events = [e.split(':')[0] for e in events]
    else:
        events = None

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))
        
    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

    features = LdaFeatures(words, tags)

    #Extract and classify entities
    for i in range(len(features.entities)):
        type = None
        wids = [str(vocab.GetID(x.lower())) for x in features.features[i] if vocab.HasWord(x.lower())]
        if llda and len(wids) > 0:
            entityid = "-1"
            if entityMap.has_key(features.entityStrings[i].lower()):
                entityid = str(entityMap[features.entityStrings[i].lower()])
            labels = dictionaries.GetDictVector(features.entityStrings[i])

            if sum(labels) == 0:
                labels = [1 for x in labels]
            llda.stdin.write("\t".join([entityid, " ".join(wids), " ".join([str(x) for x in labels])]) + "\n")
            sample = llda.stdout.readline().rstrip('\n')
            labels = [dict2label[dictMap[int(x)]] for x in sample[4:len(sample)-8].split(' ')]

            count = {}
            for label in labels:
                count[label] = count.get(label, 0.0) + 1.0
            maxL = None
            maxP = 0.0
            for label in count.keys():
                p = count[label] / float(len(count))
                if p > maxP or maxL == None:
                    maxL = label
                    maxP = p

            if maxL != 'None':
                tags[features.entities[i][0]] = "B-%s" % (maxL)
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "I-%s" % (maxL)
            else:
                tags[features.entities[i][0]] = "O"
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "O"
        else:
            tags[features.entities[i][0]] = "B-ENTITY"
            for j in range(features.entities[i][0]+1,features.entities[i][1]):
                tags[j] = "I-ENTITY"

    output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
    if pos:
        output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
    if chunk:
        output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
    if events:
        output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
    return " ".join(output)
Example #55
def tokenize(tweet):
    tweet = tweet.replace("\\n", " \\n ")
    tweet = tweet.replace("@", " @")
    tokens = twokenize.tokenize(tweet)
    return split_contractions(tokens)
Example #56
#!/usr/bin
# Tokenize text using twokenize
# Francisco Guzman
import sys
sys.path.append('third-party')
import twokenize as tok



for line in sys.stdin:
	tokenized = tok.tokenize(line.decode("utf-8",'ignore'))
	print u" ".join( tokenized).encode("utf-8",'ignore')


Example #57
def Geo_C(intput):
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

    for raw, tweet in iterate(raw=True, inputList=intput):
      source = lookup(tweet, 'source')
      if "Buoy" in source:
        # print "REJECT BUOY\t" + json.dumps(tweet)
        continue

      n_fol = lookup(tweet, 'user.followers_count') or 0
      n_fri = lookup(tweet, 'user.friends_count') or 0
      if not (n_fol < 1000 and n_fri < 1000):
        # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user'))
        continue

      text = lookup(tweet, 'text')
      if not text.strip():
        # print "REJECT NO TEXT\t" + json.dumps(record)
        continue

      lat = None
      lon = None
      orig_str = ""

      loc_type = None

      geo = lookup(tweet, 'geo')
      if geo and geo['type'] == 'Point':
        lat, lon = geo['coordinates']
        loc_type = 'OFFICIAL'
      else:
        loc = lookup(tweet, 'user.location').strip()
        if not loc:
          # print "REJECT NO USERLOC\t" + json.dumps(record)
          continue
        m = LatLong.search(loc.encode('utf8'))
        if not m:
          # print "REJECT NO GEO REGEX\t" + json.dumps(record)
          continue
        lat, lon = m.groups()
        loc_type = 'REGEX'

      lat = float(lat); lon = float(lon)
      if (lat, lon) == (0, 0) or lat < -90 or lat > 90 or lon < -180 or lon > 180:
        # print "REJECT JUNK GEO\t" + json.dumps([lat,lon]) + "\t" + json.dumps(record)
        continue

      # # For our applications we usually want to kill retweets
      if lookup(tweet, 'retweeted_status'):
        # print "REJECT OFFICIAL RT\t" + json.dumps(text)
        continue
      toks = twokenize.tokenize(text)
      if any(tok == 'RT' for tok in toks):
        # print "REJECT TEXT RT\t" + json.dumps(text)
        continue

      # Build a "SmallTweet" format record
      record = {
          'id': lookup(tweet, 'id'),
          'user': lookup(tweet, 'user.screen_name'),
          'date': tweet['created_at_datetime'].strftime("%Y-%m-%dT%H:%M:%S"),
          'text': lookup(tweet, 'text')
      }

      record['lonlat'] = [lon, lat]

      if '\t' in record['user']:
        print >> sys.stderr, "WTF\t" + json.dumps(record)
        continue



      out = [
          # 'GEO ' + loc_type,
#          str(record['id']),
#          record['user'].encode('utf-8'),
#          record['date'].encode('utf-8'),
          str(record['lonlat'][0]) + '+' + str(record['lonlat'][1])
#          record['text'].encode('utf-8')
          # json.dumps(lookup(tweet, 'user.location')),
          # json.dumps(lookup(tweet, 'source')),
          # json.dumps(record),
      ]

#      TempVar = record

#      print '\t'.join(out)

      return '\t'.join(out)
Example #58
 def _tokenize(self, tweet):
   t = twokenize.tokenize(tweet)
   return t