Python tokenize示例，twokenize.tokenize Python示例

示例#1

0

显示文件

def process_semeval(data_folder, outfname, idfname):
    """
    Process dataset of SemEval 2013 task 2 subtask B.

    Only positive and negative tweets are kept; rows whose text is
    "Not Available" and rows with any other label are skipped. Each kept
    tweet is written to ``outfname`` as one line::

        tweet|label,user_id,split

    where ``label`` is 1 for "positive" and 0 for "negative", ``user_id`` is
    looked up by tweet id ("unknown" when missing) and ``split`` is 1, 2 or 3
    for the train, validation and test file respectively.

    Args:
        data_folder: sequence whose items 0, 1 and 2 are the paths of the
            train, validation and test files. Rows are tab-separated with
            the tweet id in column 0, the label in column 2 and the text in
            column 3.
        outfname: path of the output file (overwritten).
        idfname: path of a whitespace-separated "tweet_id user_id" file.
    """
    logger.info("start processing tweets for SemEval 2013 task 2 subtask B")
    # NOTE(review): inputs are opened in binary mode and then split on str
    # separators, which only works on Python 2 -- confirm the target
    # interpreter before changing the mode.
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]

    split_files = (data_folder[0], data_folder[1], data_folder[2])
    label_codes = {"positive": "1", "negative": "0"}
    # The context manager guarantees the output is flushed and closed even
    # when a malformed row raises half-way through.
    with open(outfname, "w") as fout:
        for split_id, fname in enumerate(split_files, 1):
            with open(fname, "rb") as f:
                for line in f:
                    parts = line.strip().split("\t")
                    label = parts[2]
                    tweet = parts[3]
                    if tweet == "Not Available":
                        continue
                    if label not in label_codes:
                        # Neutral/other labels never reach the output, so
                        # skip the cleaning work for them.
                        continue
                    userid = tid_uid_map.get(parts[0], "unknown")
                    tweet = clean_tweet(tweet)
                    tweet = clean_tweet_toks(tokenize(tweet))
                    fout.write(tweet + "|" + label_codes[label] + ","
                               + userid + "," + str(split_id) + "\n")
    logger.info("finish processing data")

示例#2

0

显示文件

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.

    Steps, in order:
      1. lower-case the input and tokenize it with ``twokenize.tokenize``;
      2. blank out tokens starting with "@" or "http" (they become empty
         strings, so their surrounding spaces survive the join);
      3. shorten every run of three or more identical characters to two;
      4. normalise a few quote/comma spellings;
      5. tokenize again and return the tokens joined by single spaces.

    Args:
        string: raw text.

    Returns:
        The cleaned text as a single space-separated string.
    """
    tokens = twokenize.tokenize(string.lower())
    tokens = [
        "" if tok.startswith("@") or tok.startswith("http") else tok
        for tok in tokens
    ]
    string = " ".join(tokens)

    # Drop a character when it equals both characters right before it.
    # Working from indices (instead of reading string[0] and string[1] up
    # front) keeps this safe for inputs shorter than two characters.
    string = "".join(
        ch for i, ch in enumerate(string)
        if i < 2 or not (string[i - 2] == string[i - 1] == ch)
    )

    string = string.replace("`","\'")
    string = string.replace("\u002c",",")
    string = string.replace("\u2019","\'")
    string = string.replace("\\\"\"","\"")

    string = twokenize.tokenize(string)
    return " ".join(string)

示例#3

0

显示文件

def process_omd(fname, outfname, idfname):
    """
    Process OMD dataset: each tweet carries three votes (1: neg, 2: pos).

    A tweet is kept only when at least two of its three votes agree on 1
    (label "0") or on 2 (label "1"); everything else is skipped. Each kept
    tweet is written to ``outfname`` as one line::

        tweet|label,user_id

    Args:
        fname: path of the tab-separated OMD file; column 0 is the tweet id,
            column 2 the tweet text and the last three columns the votes.
        outfname: path of the output file (overwritten).
        idfname: path of a whitespace-separated "tweet_id user_id" file.
    """
    logger.info("start processing tweets for OMD")
    # NOTE(review): inputs are opened in binary mode and then split on str
    # separators, which only works on Python 2 -- confirm the target
    # interpreter before changing the mode.
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]

    # Both files are managed together so the output is closed even when a
    # malformed row raises.
    with open(outfname, "w") as fout, open(fname, "rb") as f:
        for line in f:
            parts = line.strip().split("\t")
            votes = sorted((int(parts[-1]), int(parts[-2]), int(parts[-3])))
            if votes[0] == 1 and votes[1] == 1:
                label = "0"
            elif votes[1] == 2 and (votes[0] == 2 or votes[2] == 2):
                label = "1"
            else:
                continue
            tweet = parts[2]
            userid = tid_uid_map.get(parts[0], "unknown")
            # startswith/endswith tolerate an empty tweet, where indexing
            # tweet[0] / tweet[-1] would raise IndexError.
            if tweet.startswith("\""):
                tweet = tweet[1:].strip()
            if tweet.endswith("\""):
                tweet = tweet[:-1].strip()
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            fout.write(tweet + "|" + label + "," + userid + "\n")
    logger.info("finish processing data")

示例#4

0

显示文件

文件： check_lemma_stem_lower.py 项目： Anderbone/CS918NaturalLanguageProcessing

def perprocessing(tdic):
    """
    Normalise, lower-case and lemmatise every tweet in ``tdic``.

    Args:
        tdic: mapping of tweet id -> sequence whose item 0 is the label and
            item 1 the raw tweet text.

    Returns:
        A pair ``(processed, pos_features)``: ``processed`` maps each id to
        ``(label, text)`` and ``pos_features`` is ``np.array`` over the
        second item returned by ``lemma`` for each tweet.
    """
    processed = {}
    pos_rows = []
    for key in tdic:
        entry = tdic[key]
        label = entry[0]
        raw_text = ' '.join(twokenize.tokenizeRawTweetText(entry[1]))
        normalized = twokenize.normalizeTextForTagger(raw_text)
        tokens = twokenize.tokenize(normalized)
        print(tokens)
        lowered = [tok.lower() for tok in tokens]
        # lemma() hands back (tokens, per-tweet POS feature)
        lemma_result = lemma(lowered)
        pos_rows.append(lemma_result[1])
        joined = ' '.join(lemma_result[0])
        # final replacement pass (original note: "change to URLINK SADFACE")
        joined = textPreprocessor01.replaceall(joined)
        print(joined)
        processed[key] = label, joined
    return processed, np.array(pos_rows)

示例#5

0

显示文件

文件： cluster.py 项目： driscoll/cluster

def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """Strip and lower-case ``s``, tokenize it, drop stopwords and return
    the result of ``tokens_to_kshingles`` for the remaining tokens and ``k``
    (described by the original author as a set of k-shingles).
    """
    normalized = s.strip().lower()
    kept = filterstopwords(twokenize.tokenize(normalized), stopwords)
    return tokens_to_kshingles(kept, k)

示例#6

0

显示文件

 def __init__(self, testData):
     """Load tab-separated rows from ``testData`` into ``self.labeledTweets``.

     Column 6 of every row is replaced by its twokenize tokens joined with
     single spaces; all other columns are stored unchanged.
     """
     self.labeledTweets = []
     # Context manager so the handle is closed even if a row is malformed.
     with open(testData) as data_file:
         for line in data_file:
             fields = line.rstrip('\n').split('\t')
             fields[6] = ' '.join(twokenize.tokenize(fields[6]))
             self.labeledTweets.append(fields)

示例#7

0

显示文件

def prepare_and_tokenize(text,
                         url_scheme='st',
                         strip_word_padding=False,
                         alphanumeric_only=False):
    """
        Prepare raw tweet text and tokenize it with twokenize
        (https://github.com/ianozsvald/ark-tweet-nlp-python).

        url_scheme: 'leave' keeps URLs untouched; any other value (default
            'st', single token) replaces each URL match by "hyperlinktoken".
        strip_word_padding: when true, delete matches of word_pad_re
            (original TODO: "Make this actually work").
        alphanumeric_only: when true, delete matches of alphanumeric_only_re.

        Mentions are always replaced by "mentiontoken", and every distinct
        emoji match is surrounded by spaces before tokenizing.
    """
    # (enabled?, pattern, replacement) applied top to bottom
    substitutions = (
        (url_scheme != 'leave', url_re, "hyperlinktoken"),
        (True, mention_re, "mentiontoken"),
        (strip_word_padding, word_pad_re, ""),
        (alphanumeric_only, alphanumeric_only_re, ""),
    )
    for enabled, pattern, replacement in substitutions:
        if enabled:
            text = re.sub(pattern, replacement, text)

    # pad each distinct emoji with spaces so it becomes its own token
    for emoji in set(emoji_re.findall(text)):
        text = text.replace(emoji, " " + emoji + " ")

    return tokenize(text)

示例#8

0

显示文件

文件： __init__.py 项目： samiroid/utils

def preprocess(m, sep_emoji=False):
    """Normalise a message and return it as a space-separated token string.

    The message is lower-cased, passed through ``max_reps``, user mentions
    are replaced by " @user ", URLs (``twokenize.url``) by " url ", and the
    result is tokenized with ``twokenize.tokenize``.

    Args:
        m: raw message text.
        sep_emoji: when true, re-tokenize with ``twk.tokenize``; if that
            yields a different token count and at least one token is in
            ``twk.punctuation``, the message is rebuilt from the ``twk``
            tokens with runs of identical punctuation tokens glued back
            together.

    Returns:
        The tokenized message with leading whitespace removed.
    """
    m = m.lower()
    m = max_reps(m)
    #replace user mentions with token '@user'
    user_regex = r".?@.+?( |$)|<@mention>"
    m = re.sub(user_regex," @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url," url ", m, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            # A real list, not map(): on Python 3 map() is a one-shot
            # iterator that any() would exhaust and that cannot be indexed.
            has_punct = [tok in twk.punctuation for tok in n_toks]
            if any(has_punct):
                pieces = [n_toks[0]]
                for i in range(1, len(n_toks)):
                    same_punct = (has_punct[i] and has_punct[i-1]
                                  and n_toks[i] == n_toks[i-1])
                    # repeated punctuation is concatenated, anything else
                    # is separated by a space
                    pieces.append(n_toks[i] if same_punct else " " + n_toks[i])
                tokenized_msg = "".join(pieces)
    return tokenized_msg.lstrip()

示例#9

0

显示文件

文件： getTaggedFile.py 项目： h4x0rsz/senior-design

def main(argv):
    """POS-tag every line of an input file and write the formatted tags.

    Usage: ``python getTaggedFile.py infile.txt outfile.txt``

    NOTE(review): the paths are read from ``sys.argv``; the ``argv``
    parameter is accepted but not used -- kept that way for compatibility.
    """
    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        # sys.exit() instead of the site-provided exit() helper, which is
        # not guaranteed to exist outside interactive sessions.
        sys.exit()

    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    # Both files are closed by the context manager, also on errors.
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        tagger = PerceptronTagger()
        tagset = None

        print("Reading file...")
        for line in infile:
            # Use Twokenizer for twitter parser
            tokens = tokenize(line)
            tags = nltk.tag._pos_tag(tokens, tagset, tagger)
            outfile.write(format_tagged(tags))

    print("Finished tagging... Closing files.")

示例#10

0

显示文件

文件： cap_classifier.py 项目： 52nlp/twitter_nlp

    def Extract(self, text):
        """Build the capitalisation feature string for ``text``.

        Returns space-separated ``id:value`` pairs: one ``id:1`` entry per
        collected feature (every "upperViolated=<word>" plus the final
        "iCapitalized=<bool>"), followed by the ratios nAllCaps,
        nCapitalized, nCapLowerViolated and nCapUpperViolated, each divided
        by nWords. All counters start at 0.1.
        """
        features = []
        words = twokenize.tokenize(text)
        boundary_re = r"\.|\?|!|@.+|http:.+|:|\""

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for idx, word in enumerate(words):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', word)
            # first token, or previous token matches the boundary pattern
            at_boundary = idx == 0 or re.match(boundary_re, words[idx - 1])

            if not at_boundary:
                if capitalized:
                    nCapitalized += 1.0
                    if self.capDict.get(word.lower(), '1') != '1':
                        nCapUpperViolated += 1.0
                        features.append(self.fVocab.GetID('upperViolated=%s' % word.lower()))
                elif re.match(r'[a-z]+', word) and self.capDict.get(word.lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                if re.match(r'\w+', word[0:1]):
                    nWords += 1
            # re.match anchors only at the start of the token
            if re.match(r"i|i'm|im|u", word):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", word):
                nAllCaps += 1

        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        head = ' '.join(["%s:1" % x for x in features])
        ratios = (
            ('nAllCaps', nAllCaps),
            ('nCapitalized', nCapitalized),
            ('nCapLowerViolated', nCapLowerViolated),
            ('nCapUpperViolated', nCapUpperViolated),
        )
        tail = ''.join(
            " %s:%s" % (self.fVocab.GetID(name), count / nWords)
            for name, count in ratios
        )
        return head + tail

示例#11

0

显示文件

文件： predict_psycho-demographics_deceptions.py 项目： FineTear/Twitter-Deceptive-Information-Predictor

    def take_into_account_negation(self, tweet):
        """Append "_neg " to every token from the first negation word on.

        Returns ``tweet`` unchanged when no negation word is found.
        Otherwise returns the text before the negation followed by the
        twokenize tokens of the negated span, each suffixed with "_neg ".
        """
        neg_pattern = re.compile(
            'never|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|no|'
            +
            'n\'t|haven\'t|haven\'t|hasn\'t|hadn\'t|can\'t|couldn\'t|shouldn\'t|won\'t|wouldn\'t|don\'t|doesn\'t|didn\'t|isn\'t|aren\'t',
            re.IGNORECASE)
        clause_pattern = re.compile(r'^[.:;!?]$')

        neg = neg_pattern.search(tweet)
        if neg is None:
            return tweet

        scope = tweet[neg.start():]
        # NOTE(review): clause_pattern is anchored with ^...$, so it matches
        # only a string that is exactly one punctuation character; `scope`
        # always starts with the negation word, so this looks like it never
        # matches and the whole remainder gets negated -- confirm intent.
        end = clause_pattern.search(scope)
        if end is None:
            end_str = len(tweet)
        else:
            end_str = int(end.start()) - 1

        tokens = twokenize.tokenize(scope[:end_str])
        marked = ''.join(w + '_neg ' for w in tokens)
        return tweet[:neg.start()] + marked

示例#12

0

显示文件

def preprocess(m, sep_emoji=False):
    """Normalise a message and return it as a space-separated token string.

    The message is lower-cased, passed through ``max_reps``, user mentions
    are replaced by " @user ", URLs (``twokenize.url``) by " url ", and the
    result is tokenized with ``twokenize.tokenize``.

    Args:
        m: raw message text.
        sep_emoji: when true, re-tokenize with ``twk.tokenize``; if that
            yields a different token count and at least one token is in
            ``twk.punctuation``, the message is rebuilt from the ``twk``
            tokens with runs of identical punctuation tokens glued back
            together.

    Returns:
        The tokenized message with leading whitespace removed.
    """
    m = m.lower()
    m = max_reps(m)
    #replace user mentions with token '@user'
    user_regex = r".?@.+?( |$)|<@mention>"
    m = re.sub(user_regex, " @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url, " url ", m, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            # Materialise the flags: a Python 3 map() object would be
            # consumed by any() and cannot be indexed afterwards.
            has_punct = [tok in twk.punctuation for tok in n_toks]
            if any(has_punct):
                pieces = [n_toks[0]]
                for i in range(1, len(n_toks)):
                    same_punct = (has_punct[i] and has_punct[i - 1]
                                  and n_toks[i] == n_toks[i - 1])
                    # repeated punctuation is concatenated, anything else
                    # is separated by a space
                    pieces.append(n_toks[i] if same_punct else " " + n_toks[i])
                tokenized_msg = "".join(pieces)
    return tokenized_msg.lstrip()

示例#13

0

显示文件

文件： vectorize_test.py 项目： enewe101/stance-demo

    def build_dict(self, corpus, word=True, which_grams=None):
        """
        Build the n-gram dictionary for ``corpus``.

        Args:
            corpus: iterable of texts.
            word: True (default) builds word n-grams from twokenize tokens;
                False builds character n-grams.
            which_grams: the n values to extract. When omitted, words use
                [1, 2, 3] and characters use [2, 3, 4, 5]. (This argument
                used to be overwritten unconditionally; it is now honoured.)

        Returns:
            The populated ``UnigramDictionary``, which is also stored on
            ``self.word_ngram`` or ``self.char_ngram``.
        """
        dct = UnigramDictionary()
        if which_grams is None:
            which_grams = [1, 2, 3] if word else [2, 3, 4, 5]

        for text in corpus:
            tokens = twokenize.tokenize(text) if word else list(text)
            for n in which_grams:
                for token in find_ngrams(tokens, n):
                    dct.add(token)

        if word:
            self.word_ngram = dct
        else:
            self.char_ngram = dct

        return dct

示例#14

0

显示文件

def perprocessing(tdic):
    """
    Normalise, lower-case and Porter-stem every tweet in ``tdic``.

    Args:
        tdic: mapping of tweet id -> sequence whose item 0 is the label and
            item 1 the raw tweet text.

    Returns:
        dict mapping each tweet id to ``(label, processed_text)``.
    """
    # One stemmer for the whole run; it used to be rebuilt for every token.
    stemmer = nltk.stem.PorterStemmer()
    new_dic = {}
    for key in tdic:
        gt = tdic[key][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[key][1]))
        text = twokenize.normalizeTextForTagger(raw)
        stems = [stemmer.stem(word.lower()) for word in twokenize.tokenize(text)]
        newtext = textPreprocessor01.replaceall(' '.join(stems))
        new_dic[key] = gt, newtext
    return new_dic

示例#15

0

显示文件

文件： tagAndLabel.py 项目： h4x0rsz/senior-design

def main(argv):
    """Tokenize ``line``, POS-tag it with a PerceptronTagger and pass the
    tags to ``format_tagged``.

    NOTE(review): ``line`` is not defined in this function (presumably a
    module-level name), ``argv`` is unused and the formatted result is
    discarded -- confirm against the full file.
    """
    tagger = PerceptronTagger()
    # no tagset mapping is requested (None)
    tagged = nltk.tag._pos_tag(tokenize(line), None, tagger)
    format_tagged(tagged)

示例#16

0

显示文件

文件： search.py 项目： zaycev/n7

 def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
     """Assign integer ids to terms and write per-tweet term-id vectors.

     Rows are read as CSV (comma delimiter, double-quote quoting); column 0
     is the integer tweet id and the last column is the tweet text. Tokens
     are lower-cased and UTF-8 encoded, and optionally lemmatised with
     ``self.lmtz``.

     Args:
         tweets_file_object: open file (or any iterable of lines) with the
             CSV data.
         learn_lemmas: lemmatise terms before assigning ids.
         cache_size: number of tweet vectors buffered before they are
             handed to ``self.write_tweet_vectors``.

     ``term_freq`` counts each term at most once per tweet. When all rows
     are read, the remaining vectors and then the term map and frequencies
     are written via ``self.write_tweet_vectors`` / ``self.write_terms``.
     """
     reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
     term_freq = Counter()
     term_id_map = dict()
     tweet_vectors = []
     for row in reader:
         tweet_id = int(row[0])
         tweet_text = row[-1]
         terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
         if learn_lemmas:
             terms = [self.lmtz.lemmatize(term) for term in terms]
         tweet_sp_vector = []
         # A set gives O(1) "already counted in this tweet?" checks; the
         # previous list made this quadratic in the tweet length.
         counted_ids = set()
         for term in terms:
             # New terms get the next free id (the current map size).
             term_id = term_id_map.setdefault(term, len(term_id_map))
             if term_id not in counted_ids:
                 term_freq[term_id] += 1
                 counted_ids.add(term_id)
             tweet_sp_vector.append(term_id)
         tweet_vectors.append((tweet_id, tweet_sp_vector))
         if len(tweet_vectors) >= cache_size:
             self.write_tweet_vectors(tweet_vectors)
             tweet_vectors = []
     self.write_tweet_vectors(tweet_vectors)
     self.write_terms(term_id_map, term_freq)

示例#17

0

显示文件

    def Extract(self, text):
        """Build the capitalisation feature string for ``text``.

        Returns space-separated ``id:value`` pairs: one ``id:1`` entry per
        collected feature (every "upperViolated=<word>" plus the final
        "iCapitalized=<bool>"), followed by the ratios nAllCaps,
        nCapitalized, nCapLowerViolated and nCapUpperViolated, each divided
        by nWords. All counters start at 0.1.
        """
        features = []
        words = twokenize.tokenize(text)
        boundary_re = r"\.|\?|!|@.+|http:.+|:|\""

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for idx, word in enumerate(words):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', word)
            # first token, or previous token matches the boundary pattern
            at_boundary = idx == 0 or re.match(boundary_re, words[idx - 1])

            if not at_boundary:
                if capitalized:
                    nCapitalized += 1.0
                    if self.capDict.get(word.lower(), '1') != '1':
                        nCapUpperViolated += 1.0
                        features.append(self.fVocab.GetID('upperViolated=%s' % word.lower()))
                elif re.match(r'[a-z]+', word) and self.capDict.get(word.lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                if re.match(r'\w+', word[0:1]):
                    nWords += 1
            # re.match anchors only at the start of the token
            if re.match(r"i|i'm|im|u", word):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", word):
                nAllCaps += 1

        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        head = ' '.join(["%s:1" % x for x in features])
        ratios = (
            ('nAllCaps', nAllCaps),
            ('nCapitalized', nCapitalized),
            ('nCapLowerViolated', nCapLowerViolated),
            ('nCapUpperViolated', nCapUpperViolated),
        )
        tail = ''.join(
            " %s:%s" % (self.fVocab.GetID(name), count / nWords)
            for name, count in ratios
        )
        return head + tail

示例#18

0

显示文件

文件： cap_eval.py 项目： 52nlp/twitter_nlp

 def __init__(self, testData):
     """Load tab-separated rows from ``testData`` into ``self.labeledTweets``.

     Column 6 of every row is replaced by its twokenize tokens joined with
     single spaces; all other columns are stored unchanged.
     """
     self.labeledTweets = []
     # Context manager so the handle is closed even if a row is malformed.
     with open(testData) as data_file:
         for line in data_file:
             fields = line.rstrip('\n').split('\t')
             fields[6] = ' '.join(twokenize.tokenize(fields[6]))
             self.labeledTweets.append(fields)

示例#19

0

显示文件

文件： decoder.py 项目： BBN-E/Hume

def get_sentences(line, content_type):
    """Split ``line`` into sentences according to ``content_type``.

    * 'SocialMediaPosting': the line minus its final character is tokenized
      with twokenize and returned as a single sentence -- a one-item list
      holding a list of ``Token(text, offset)``, where offset is the
      token's position in ``line``.
    * 'Blog', 'NewsArticle', 'Post': the line is parsed with the
      module-level spaCy pipeline ``spacy_en`` (loaded on first failure)
      and ``spacy_doc.sents`` is returned.
    * any other value: returns None.
    """
    global spacy_en

    if content_type == 'SocialMediaPosting':
        sent = []
        start_offset = 0

        for token in twokenize.tokenize(line[:-1]):
            # search from the end of the previous token so repeated tokens
            # map to successive positions
            idx = line.index(token, start_offset)
            sent.append(Token(token, idx))
            start_offset = idx + len(token)

        return [sent]

    elif content_type in ('Blog', 'NewsArticle', 'Post'):
        try:
            spacy_doc = spacy_en(line)
        except Exception:
            # Lazy (re)load of the model, then retry once. `Exception`
            # rather than a bare except, so Ctrl-C / SystemExit are not
            # swallowed and turned into a model load.
            spacy_en = spacy.load('en')
            print('**** Loaded spacy en')
            spacy_doc = spacy_en(line)

        return spacy_doc.sents

    return None

示例#20

0

显示文件

文件： predict_psycho-demographics_deceptions.py 项目： FineTear/Twitter-Deceptive-Information-Predictor

    def extract_more_decep_tech_features(self, tweets, vocab_file):
        """Build the decep_tech/decep_type feature matrix for ``tweets``.

        Five hand-crafted columns per tweet (number of elongated-letter
        matches, all-caps matches, punctuation-run matches, hashtag matches
        and an emoticon mood code) are stacked in front of a binary
        bag-of-words matrix restricted to the training vocabulary.

        Args:
            tweets: sequence of tweet strings.
            vocab_file: path of a file with one vocabulary term per line;
                the line index becomes the term's column index.

        Returns:
            ``hstack([hand_crafted, bag_of_words])`` as a sparse matrix.
        """
        train_vocab = {}
        # Context manager so the vocabulary file is closed after reading.
        with open(vocab_file) as vocab_lines:
            for k, line in enumerate(vocab_lines):
                train_vocab[line.strip()] = k

        # The keyword was garbled to `vocSuraiyalary`, which CountVectorizer
        # rejects with TypeError; the parameter is `vocabulary`.
        cv = CountVectorizer(ngram_range=(1, 1),
                             binary=True,
                             vocabulary=train_vocab)
        train_features_bow = cv.fit_transform(tweets)

        # Raw strings with the same pattern text as before (avoids
        # invalid-escape warnings).
        hash_pattern = re.compile(r"\#+[\w_]+[\w'_\-]*[\w_]+")
        elong_pattern = re.compile(r"([a-zA-Z])\1{2,}")
        caps_pattern = re.compile(r'[A-Z][A-Z\d]+')
        # NOTE(review): this also matches a single punctuation mark, not
        # only repeated ones -- confirm intent.
        punc_pattern = re.compile(r'([.,!?]+)')

        # Moods returned by emoticons.analyze_tweet -> numeric code. Any
        # other return value is passed through unchanged, as before.
        mood_codes = {'NA': 0, 'HAPPY': 2, 'SAD': 1, 'BOTH_HS': 4}

        add_decep_tech_matrix = []
        for tweet in tweets:
            n_elong = len(re.findall(elong_pattern, tweet))
            n_caps = len(re.findall(caps_pattern, tweet))
            n_rep_punct = len(re.findall(punc_pattern, tweet))
            n_hashtag = len(re.findall(hash_pattern, tweet))

            emoticon_mood = emoticons.analyze_tweet(tweet.strip())
            emoticon_mood = mood_codes.get(emoticon_mood, emoticon_mood)

            add_decep_tech_matrix.append(
                [n_elong, n_caps, n_rep_punct, n_hashtag, emoticon_mood])

        sa = sparse.csr_matrix(add_decep_tech_matrix)
        features = hstack([sa, train_features_bow])

        return features

示例#21

0

显示文件

文件： PreprocessClass.py 项目： suddu16/Youtube-Comedy-Comparison

	def process(self,text):
		"""Normalise ``text`` and return it as a space-separated string of stems.

		Pass 1 works on whitespace-separated words: "#" is replaced by a
		space, and a word that contained "#" is dropped if any entry of
		``self.remove`` occurs in it.
		Pass 2 works on twokenize tokens, skipping stop words and pure-digit
		tokens. Each token is lower-cased, cut to at most 27 characters and
		then normalised by the first lookup that succeeds: ``self.emoticons``,
		"already in ``self.wordDict``" (kept as is), ``self.acronyms``,
		``self.contractions``, and finally ``self.correct``. Tokens
		containing "@" become "@user".
		The result is stemmed with the module-level ``stemmer``, again
		dropping stop words.
		"""
		
		tTweet = ""
		for word in text.split():
			if "#" in word:
				word = word.replace("#"," ")
				# f flags whether any entry of self.remove occurs in the word
				f=0
				for tt in self.remove:
					if tt in word:
						f=1
				if f==1:
					continue
			tTweet = " ".join([tTweet,word])
			tTweet = tTweet.strip()

		tempTweet = ""
		for word in twokenize.tokenize(tTweet):
			if word != " " and word not in self.stop and not word.isdigit():
				word = word.strip().lower()
				# words longer than 26 characters are cut to their first 27
				if len(word) > 26:
					word=word[:27]
				# Normalize emoticons; a failed lookup falls through to the
				# next dictionary in the chain below.
				try:
					word = self.emoticons[word]
				except:
					# Normalize acronyms -- unless the word is present in
					# wordDict, in which case it is left unchanged.
					try:
						try:
							if  self.wordDict[word] ==1:
								word = word
						except:
							word = self.acronyms[word]
					except:
					# Normalize contractions
						try:
							word = self.contractions[word]
						except:
							# Normalize words (spelling); only reached when
							# every lookup above failed.
							try:
								if self.wordDict[word] == 1:
									word =	word
							except:
								CW = self.correct(word)
								# mentions/hashtags are never spell-corrected
								if "@" in word or "#" in word:
									word = word
								else:
									# a correction of "a" is ignored
									# NOTE(review): presumably a fallback
									# value of self.correct -- confirm
									if CW != "a":
										word = CW
				# collapse every mention into one placeholder token
				if "@" in word:
					word="@user"
				tempTweet = " ".join([tempTweet,word.strip()])
				tempTweet = tempTweet.lower().strip()
		# stem what is left, dropping stop words once more
		tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
		#print(tempTweet.encode("utf-8"))
		return(tempTweet)

##Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")

示例#22

0

显示文件

文件： createDictionaries.py 项目： npow/Ubuntu-Dialogue-Generationv2

def process_line(s, clean_string=True):
    """Tokenize ``s`` and run ``process_token`` over every token.

    When ``clean_string`` is true the text first goes through ``clean_str``.
    Tokens are POS-tagged and NE-chunked with nltk; each (chunk, token) pair
    is handed to ``process_token`` and the result is lower-cased and UTF-8
    encoded.
    """
    text = clean_str(s) if clean_string else s
    tokens = tokenize(text)
    tagged = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(tagged, binary=False)
    processed = []
    # map(None, a, b) is the Python 2 spelling of "zip, padding with None"
    for chunk, token in map(None, chunks, tokens):
        processed.append(process_token(chunk, token).lower().encode('UTF-8'))
    return processed

示例#23

0

显示文件

文件： twitter_sentence_spliter.py 项目： ssanbu08/Twitter-URL-Corpus

def splitTweet2Sents(tweet):
    """Clean a tweet and split it into sentences.

    Curly double quotes become plain '"', remaining non-ASCII characters
    are dropped, the text is tokenized and re-joined with single spaces,
    filtered with ``filterTweetText`` and finally passed to
    ``sentSplitter``, whose result is returned.
    """
    # left, then right curly quote -> straight quote
    straightened = re.sub(u'\u201d', '\"', re.sub(u'\u201c', '\"', tweet))
    ascii_only = straightened.encode('ascii', 'ignore')
    spaced = u" ".join(tokenize(ascii_only))
    return sentSplitter(filterTweetText(spaced))

示例#24

0

显示文件

def tokenize(text):
    """Tokenize ``text`` with twokenize, drop tokens that are a single
    ``string.punctuation`` character, and return the result of
    ``stem_tokens`` with a PorterStemmer.
    """
    stemmer = PorterStemmer()
    # Build the punctuation set once instead of once per token.
    punctuation = set(string.punctuation)
    tokens_clean = [tok for tok in twokenize.tokenize(text)
                    if tok not in punctuation]
    return stem_tokens(tokens_clean, stemmer)

示例#25

0

显示文件

def preprocess(m):
    """Lower-case ``m``, pass it through ``max_reps``, replace user
    mentions by " @user " and URLs (``twokenize.url``) by " url ", then
    return the twokenize tokens.
    """
    user_regex = r".?@.+?( |$)|<@mention>"
    squeezed = max_reps(m.lower())
    without_mentions = re.sub(user_regex, " @user ", squeezed, flags=re.I)
    without_urls = re.sub(twokenize.url, " url ", without_mentions, flags=re.I)
    return twokenize.tokenize(without_urls)

示例#26

0

显示文件

文件： pmi.py 项目： zaycev/n7

def all_tokens(tweetreader):
    """Yield every token of the last column of each row in ``tweetreader``,
    stopping after the 50000th row has been processed.
    """
    for row_number, row in enumerate(tweetreader, 1):
        for token in tokenize(row[-1]):
            yield token
        if row_number >= 50000:
            return

示例#27

0

显示文件

文件： find_testfiles.py 项目： strategist922/Ubuntu-Dialogue-Generationv2

def process_line(s, clean_string=True):
    """
    Process a line by calling process_token on each of its tokens.

    When ``clean_string`` is true the text first goes through ``clean_str``.
    Tokens are POS-tagged and NE-chunked with nltk; each (chunk, token) pair
    is handed to ``process_token`` and the result is lower-cased and UTF-8
    encoded.
    """
    text = clean_str(s) if clean_string else s
    tokens = tokenize(text)
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
    # map(None, a, b) is the Python 2 spelling of "zip, padding with None"
    pairs = map(None, chunks, tokens)
    return [process_token(chunk, tok).lower().encode('UTF-8') for chunk, tok in pairs]

示例#28

0

显示文件

文件： get_tweets_per_zone_word.py 项目： JeffryLee/urbanSenti

def run_all():
    """Collect, per zone and per top word, the tweets containing that word.

    Reads the bin->neighbourhood map (point_map.csv), the
    neighbourhood->zone map (zone_map.csv) and each zone's "top words" list
    (outputs/zone_words.json), then scans every row of the ``tweet_pgh``
    table. A tweet is appended as "<screen name>: <text>" under every top
    word of its zone that occurs among its lower-cased twokenize tokens.
    The result is written to outputs/tweets_per_zoneword.json.

    NOTE(review): Python 2 code (print statement, unicode()); the files
    opened with open() here are never closed explicitly.
    """

    csv.field_size_limit(sys.maxsize)

    psql_conn = psycopg2.connect("dbname='tweet'")
    psycopg2.extras.register_hstore(psql_conn)
    pg_cur = psql_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    
    # Build up the bins-to-nghds mapping so we can easily translate.
    bins_to_nghds = {}
    for line in DictReader(open('point_map.csv')):
        bins_to_nghds[(float(line['lat']), float(line['lon']))] = line['nghd']
    #get nghd to zone mapping
    nghds_to_zones = {}
    for line in DictReader(open('zone_map.csv')):
        nghds_to_zones[line['nghd']] = line['zone']

    words_per_zone = json.load(open('outputs/zone_words.json'))
    top10words = {}
    # zone -> word -> list of "username: tweet" strings
    tweets_per_word = defaultdict(lambda: defaultdict(list))
   
    for zone in words_per_zone:
        top10words[zone] = words_per_zone[zone]["top words"]

    pg_cur.execute("SELECT text, ST_ASGEOJSON(coordinates), user_screen_name " + 
                                                        "FROM tweet_pgh;")
    counter = 0
    for row in pg_cur:
        counter += 1
        # progress report every 10000 rows
        if (counter % 10000) == 0:
            print str(counter) + ' tweets processed'
        coords = json.loads(row[1])['coordinates']
        # the rounding helper is called as (coords[1], coords[0])
        bin = util.util.round_latlon(coords[1], coords[0])
        if bin in bins_to_nghds:
            tweet_nghd = bins_to_nghds[bin]
        else:
            tweet_nghd = 'Outside Pittsburgh'
        # neighbourhoods without a zone are used as their own zone name
        if tweet_nghd in nghds_to_zones:
            zone = "Zone " + nghds_to_zones[tweet_nghd]
        else:
            zone = tweet_nghd
        tweet = row[0]
        # straighten curly quotes, then drop undecodable bytes
        tweet = tweet.replace('“','"').replace('”','"')
        tweet = unicode(tweet, errors='ignore')
        username = row[2]
        wordList = twokenize.tokenize(tweet)
        wordList = map(lambda x:x.lower(),wordList) 
        # NOTE(review): raises KeyError when `zone` has no entry in
        # top10words -- confirm zone_words.json covers every zone name
        for word in top10words[zone]:
            if word in wordList:
                tweets_per_word[zone][word].append(username + ": " + tweet)
   
    print "writing to JSON file"

    with open('outputs/tweets_per_zoneword.json','w') as outfile:
        json.dump(tweets_per_word,outfile, indent=2)

示例#29

0

显示文件

def tokenize_str(istring):
    ostring = []
    for line in istring.split('\n'):
        try:
            ostring.append(u" ".join(twokenize.tokenize(
                line[:])).encode('utf-8'))
        except Exception as e:
            print e
            print line

    return '\n'.join(ostring)

示例#30

0

显示文件

文件： main.py 项目： goddardc/nlp-twitter

 def process_statuses(self, statuses):
     """Tokenize each status and glue apostrophe tokens onto their
     predecessor.

     Every status' ``text`` is lower-cased and tokenized with twokenize.
     Any token after the first that starts with "'" is appended to the
     token before it (e.g. ["don", "'t"] -> ["don't"]).

     Args:
         statuses: iterable of objects with a ``text`` attribute.

     Returns:
         A list with one token list per status.
     """
     merged_statuses = []
     for status in statuses:
         tokens = twokenize.tokenize(status.text.lower())
         # Single left-to-right pass; equivalent to the old "merge, then
         # rescan from the start" loop without xrange (Python 2 only) and
         # without the quadratic rescans. The first token is never merged.
         merged = tokens[:1]
         for token in tokens[1:]:
             if token.startswith("'"):
                 merged[-1] += token
             else:
                 merged.append(token)
         merged_statuses.append(merged)
     return merged_statuses

示例#31

0

显示文件

文件： merge_data.py 项目： TPZJJ612/Ubuntu

def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transform a sentence into a list of word indices.

    Tokens missing from ``word_idx_map`` map to the index of UNK_TOKEN.
    NOTE(review): no zero padding is applied and ``k`` is unused, despite
    what the earlier docstring said.
    """
    return [
        word_idx_map[word] if word in word_idx_map else word_idx_map[UNK_TOKEN]
        for word in tokenize(sent)
    ]

示例#32

0

显示文件

文件： textPreprocessor.py 项目： hadib-z/NLP

def test(text):
    """Lowercase and clean ``text``, then return its twokenize tokens.

    The cleaning helpers are applied in a fixed order: URLs, user mentions,
    non-alphabetic characters, numbers, short words and whitespace. The
    cleaned text is stripped before it is tokenized.
    """
    cleaned = text.lower()
    for step in (replaceURLs, replaceUserMentions, replacenonalpha,
                 replacenumbers, replaceshort, replacespace):
        cleaned = step(cleaned)
    return twokenize.tokenize(cleaned.strip())

示例#33

0

显示文件

文件： vectorize_test.py 项目： enewe101/stance-demo

    def vectorize(self, text, embeddings=True, ngrams=True):
        """
        Returns the feature vector for a given text

        With ``ngrams`` the word and character n-gram features (character
        n-grams of length 2 to 5) are stacked horizontally. With
        ``embeddings`` the mean of the known word vectors is used, falling
        back to ``self.avg_embd`` when no token has a vector. When both are
        requested the embedding is stacked after the n-gram features.

        Raises:
            ValueError: if both ``embeddings`` and ``ngrams`` are false, since
                there is then nothing to return.
        """
        if not (embeddings or ngrams):
            # This case used to print a message and then fail with an
            # UnboundLocalError on the return statement.
            raise ValueError(
                "Do you not want anything? "
                "Enable at least one of embeddings / ngrams.")

        word_tokens = twokenize.tokenize(text)

        if ngrams:
            char_tokens = list(text)
            word_features = find_ngram_ft_vec(word_tokens, self.word_ngram)
            char_features = find_ngram_ft_vec(char_tokens, self.char_ngram, which_grams=[2, 3, 4, 5])

        if embeddings:
            local_w_vects = [self.word_vectors[w] for w in word_tokens if w in self.word_vectors]
            if local_w_vects:
                word_embding = csr_matrix(np.mean(local_w_vects, axis=0))
            else:
                # No token has an embedding: use the stored average instead.
                word_embding = self.avg_embd

        if not ngrams:
            return word_embding

        feature_vect = hstack((word_features, char_features))
        if embeddings:
            feature_vect = hstack((feature_vect, word_embding))
        return feature_vect

示例#34

0

显示文件

文件： merge_data.py 项目： BinbinBian/ubottu

def get_idx_from_sent(sent, word_idx_map, k):
    """
    Translate a sentence into the list of its word indices.

    Every token produced by ``tokenize(sent)`` is looked up in
    ``word_idx_map``; unknown tokens use the entry for ``UNK_TOKEN``.
    ``k`` is not used and no padding is applied.
    """
    indices = []
    for token in tokenize(sent):
        key = token if token in word_idx_map else UNK_TOKEN
        indices.append(word_idx_map[key])
    return indices

示例#35

0

显示文件

def tokenize(istring, ostring):
    """Tokenize the file at path ``istring`` line by line into ``ostring``.

    Every input line is tokenized with ``twokenize.tokenize``; the tokens are
    joined with single spaces, UTF-8 encoded and written followed by a
    newline. A line that cannot be processed is printed to stdout and
    skipped. Both files are closed even if an unexpected error escapes.
    """
    with open(istring, 'r') as ifile, open(ostring, 'w') as ofile:
        for line in ifile:
            try:
                ofile.write(
                    u" ".join(twokenize.tokenize(line)).encode('utf-8') + '\n')
            except Exception:
                # Best effort: report the offending line and carry on.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                print(line)

示例#36

0

显示文件

def preprocess(tweet):
    """Tokenize ``tweet`` and run it through the hashtag and substitution transformers.

    The substitution table is the abbreviation dictionary loaded from
    ``../other/abbreviations.json``, extended so that every entry under the
    ``emoticons`` and ``words`` keys of ``../other/emoticons.json`` maps to a
    single space. Both JSON files are read on every call and closed
    immediately after loading.

    Returns whatever ``Preprocessor.transform`` returns for the
    space-joined tokens.
    """
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emoticons.json") as emo_file:
        emo_lexica_dict = json.load(emo_file)

    for section in (u'emoticons', u'words'):
        for entry in emo_lexica_dict[section]:
            abbv_dict[entry] = ' '

    preprocessor = Preprocessor([
        Transformer.HashtagTransformer(),
        Transformer.SubstitutionTransformer(abbv_dict),
    ])
    return preprocessor.transform(' '.join(tokenize(tweet)))

示例#37

0

显示文件

文件： bigrams.py 项目： AnnuSachan/tweetmotif

def tokenize_and_clean(msg, alignments):
  """Tokenize ``msg`` and lowercase every token in place.

  ``twokenize.tokenize`` is used when ``alignments`` is truthy and the
  result of ``toks.subset(...)`` over all indices is returned; otherwise
  ``twokenize.simple_tokenize`` is used and a plain list is returned.
  """
  tokenizer = twokenize.tokenize if alignments else twokenize.simple_tokenize
  toks = tokenizer(msg)
  for position, token in enumerate(toks):
    toks[position] = token.lower()
  # Every index is kept: nothing is filtered out at this point.
  keep = range(len(toks))
  if not alignments:
    return [toks[position] for position in keep]
  return toks.subset(keep)

示例#38

0

显示文件

文件： __init__.py 项目： umass-semeval/semeval16

def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    """Normalize a tweet: mask URLs, tokenize and optionally mask numbers.

    Args:
        text: raw tweet text.
        lowercase: lowercase the text before anything else (the ``URL`` and
            ``NUM`` placeholders are inserted afterwards and stay uppercase).
        rm_digits: replace every match of ``NUM_PATTERN`` with ``NUM``.
        return_tokens: return the token list instead of a joined string.

    Returns:
        The list of tokens when ``return_tokens`` is true, otherwise the
        tokens joined with single spaces.
    """
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            # Substitute inside each token; the old ``map`` call had no
            # iterable and passed the whole list to ``re.sub``.
            tokens = [re.sub(NUM_PATTERN, 'NUM', tk) for tk in tokens]
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        # ``re.sub`` returns a new string, so the result has to be kept.
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean

示例#39

0

显示文件

文件： bigrams.py 项目： luyang1210/twittertopicsummary

def tokenize_and_clean(msg, alignments):
    """Return the lowercased tokens of ``msg``.

    With ``alignments`` truthy the message goes through
    ``twokenize.tokenize`` and the value of ``toks.subset(...)`` over every
    index is returned; otherwise ``twokenize.simple_tokenize`` is used and a
    plain list comes back.
    """
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    # Lowercase in place so the container built by the tokenizer is reused.
    for idx, raw_tok in enumerate(toks):
        toks[idx] = raw_tok.lower()
    all_indices = range(len(toks))
    return toks.subset(all_indices) if alignments else [toks[idx] for idx in all_indices]

示例#40

0

显示文件

文件： Preprocessor.py 项目： i-DAT/emotionannotate

def preprocess(tweet):
    """Tokenize ``tweet`` and apply the hashtag and substitution transformers.

    The substitution table starts from ``../other/abbreviations.json``; every
    entry listed under the ``emoticons`` and ``words`` keys of
    ``../other/emotions.json`` is then mapped to a single space. Both files
    are read on each call and closed right after loading.

    Returns the result of ``Preprocessor.transform`` on the space-joined
    tokens.
    """
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emotions.json") as emo_file:
        emo_lexica_dict = json.load(emo_file)

    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '

    transformers = [
        Transformer.HashtagTransformer(),
        Transformer.SubstitutionTransformer(abbv_dict),
    ]
    preprocessor = Preprocessor(transformers)
    joined = ' '.join(tokenize(tweet))
    return preprocessor.transform(joined)

示例#41

0

显示文件

文件： textPreprocessor.py 项目： hadib-z/NLP

def preproc(text):
    """Clean ``text`` and return it as a space-joined string of tokens.

    The text is lowercased, passed through the cleaning helpers (URLs, user
    mentions, non-alphabetic characters, numbers, short words, whitespace),
    stripped and tokenized. The tokens are then filtered by ``removestop``
    with ``clean_stop_words`` and joined with single spaces.
    """
    cleaned = text.lower()
    for step in (replaceURLs, replaceUserMentions, replacenonalpha,
                 replacenumbers, replaceshort, replacespace):
        cleaned = step(cleaned)
    tokens = twokenize.tokenize(cleaned.strip())
    return ' '.join(removestop(tokens, clean_stop_words))

示例#42

0

显示文件

文件： GZIPTweetStream.py 项目： adampoulston/language_resources

    def __iter__(self):
        """Yield the token list of each tweet stored in ``self.files``.

        Every file is opened with ``gzip.open`` and read one JSON document
        per line. When ``self.exclude_rts`` is set, tweets for which
        ``retweet_or_share`` is true or whose ``is_rt`` field is truthy are
        skipped. The text is passed through ``prepare_text``, lowercased if
        ``self.downcase`` is set, and tokenized.
        """
        for path in self.files:
            with gzip.open(path) as handle:
                for raw in handle:
                    record = json.loads(raw.strip())
                    body = record['text']
                    # 'is_rt' is only read when exclude_rts is set and the
                    # text check did not already match.
                    if self.exclude_rts and (retweet_or_share(body) or record['is_rt']):
                        continue
                    body = prepare_text(body)
                    yield tokenize(body.lower() if self.downcase else body)

示例#43

0

显示文件

文件： corpus_util.py 项目： TheElderMindseeker/irproject

def twitterTokenizeText(fn, output_fn):
    """Tokenize the text file ``fn`` and write the result to ``output_fn``.

    Each input line is stripped, lowercased, decoded from UTF-8, has its
    backticks replaced by spaces and is tokenized; the tokens are joined with
    single spaces. All lines are then joined with newlines, UTF-8 encoded
    and printed to the output file, and a completion message is printed.
    """
    with open(fn, 'r') as source:
        tokenized = [
            ' '.join(tokenize(raw.strip().lower().decode('utf8').replace('`', ' ')))
            for raw in source
        ]

    with open(output_fn, 'w') as sink:
        print('\n'.join(tokenized).encode('utf8'), file=sink)
    print('done twitter tokenizing text....')

示例#44

0

显示文件

文件： twitterstream.py 项目： kclauw/projectDistributed

def tokenizeTweets(tweets):
    """Tokenize each status and return the cleaned word lists.

    Tokens starting with ``@``, ``http`` or ``www`` are dropped. Every
    remaining token has all characters other than word characters and
    whitespace removed and is lowercased; a token made only of such
    characters becomes an empty string and is still kept. Statuses that
    yield no tokens at all are omitted from the result.
    """
    skipped_prefixes = ('@', 'http', 'www')
    cleaned_tweets = []
    for status in tweets:
        words = []
        for token in tokenize(status):
            if token.startswith(skipped_prefixes):
                continue
            words.append(re.sub(r'[^\w\s]', '', token).lower())
        if words:
            cleaned_tweets.append(words)

    return cleaned_tweets

示例#45

0

显示文件

文件： TextPreprocess.py 项目： pl8787/UbuntuDataGenerator

def process_line(s, clean_string=True, enable_tags = False):
    """
    Tokenize a line and return its processed, lowercased, UTF-8 encoded words.

    The line is first passed through ``clean_str`` unless ``clean_string`` is
    false. Without ``enable_tags`` each token goes through ``process_token``.
    With ``enable_tags`` the tokens are POS-tagged and chunked with NLTK and
    each chunk is expanded by ``process_chunk``.
    """
    tokens = tokenize(clean_str(s) if clean_string else s)
    if not enable_tags:
        return [process_token(token).lower().encode('UTF-8') for token in tokens]

    tagged = nltk.pos_tag(tokens)
    tree = nltk.ne_chunk(tagged, binary=False)
    words = []
    for chunk in tree:
        words.extend(process_chunk(chunk))
    return [word.lower().encode('UTF-8') for word in words]

示例#46

0

显示文件

文件： parse.py 项目： siddharthmodala/twittersentiment

def parse_tweets(tweets):
    """Tokenize and filter tweets, returning one record per usable tweet.

    For every tweet dict the text is cut at the first ``RT`` marker (see the
    note below), tokenized with ``tk.tokenize``, passed through
    ``char_reduction`` and expanded with ``es.expand``. Expanded tokens are
    dropped when they contain ``@``, match ``tk.Url_RE``, or match
    ``tk.Punct_re`` without matching ``emo.Emoticon_RE``.

    Returns a list of dicts with the keys ``token`` (remaining tokens),
    ``location`` (``place.country`` or ``None``), ``json`` (the original
    tweet) and ``type`` (empty string). Tweets without remaining tokens are
    omitted. Any exception raised while handling a tweet is printed and that
    tweet is skipped.
    """
    parsed_tweets = []
    for tweet_json in tweets:
        try:
            tweet_text = tweet_json['text']
            if u'RT' in tweet_text:
                # NOTE(review): when the text starts with "RT" the index is 0
                # and the slice becomes [0:-1], i.e. the whole text minus its
                # last character rather than an empty string. Left unchanged
                # because callers may rely on it -- confirm the intent.
                tweet_text = tweet_text[0:tweet_text.index(u'RT') - 1]

            tweet_token = [char_reduction(tok) for tok in tk.tokenize(tweet_text)]
            tweet_token = [
                t
                for tok in tweet_token
                for t in es.expand(tok)
                if not (
                    ('@' in t)
                    or tk.Url_RE.search(t)
                    or (not emo.Emoticon_RE.search(t) and tk.Punct_re.search(t))
                )
            ]

            if tweet_token:
                place = tweet_json['place']
                parsed_tweets.append({
                    "token": tweet_token,
                    "location": place['country'] if place is not None else None,
                    "json": tweet_json,
                    "type": "",
                })
        except Exception as e:
            # Best effort: report the problem and continue with the next tweet.
            print(e)

    return parsed_tweets

示例#47

0

显示文件

文件： Preprocessor.py 项目： i-DAT/emotionannotate

    def read_tweets(self, filename, emo):
        """Split the tweets in ``filename`` by whether they carry emotion ``emo``.

        The file is read as UTF-8 with one JSON document per line. Each
        tweet's text is stripped, tokenized and re-joined with ``SPACE``.

        Returns a pair ``(emo_tweets, non_emo_tweets)``: the joined texts of
        tweets whose ``emotions`` contain an element equal to ``emo``, and the
        joined texts of all other tweets.
        """
        emo_tweets = []
        non_emo_tweets = []
        with codecs.open(filename, encoding='utf8') as tweet_file:
            for raw_line in tweet_file:
                record = json.loads(raw_line)
                # Not used afterwards; the lookup is kept so that a record
                # without a 'tweetid' string still fails at this point.
                tweet_id = record['tweetid'].strip()
                text = record['text'].strip()
                emotions = record['emotions']
                tokens = tokenize(text)
                has_emo = any(label == emo for label in emotions)
                bucket = emo_tweets if has_emo else non_emo_tweets
                bucket.append(SPACE.join(tokens))
        return emo_tweets, non_emo_tweets

示例#48

0

显示文件

文件： tweetAnalysis.py 项目： alwayforver/demoBasic

    def __init__(self, line):
        """Build a tweet from one ``","``-separated record.

        Field 0 selects the sentiment (``"0`` -> -1, ``"2`` -> 0, ``"4`` -> 1),
        fields 1 and 2 are stored as ``id`` and ``date``, and field 5 without
        its last character is normalized into ``text``. The text is
        tokenized and POS-tagged; noun, adjective, adverb and verb tokens
        are lemmatized and lowercased into ``ltoken_tag``, and each gets a
        sentiment score in ``tweet_senti_score`` (scores whose absolute value
        does not exceed 0.02 are stored as 0).
        """
        fields = line.split('","')
        polarity_by_code = {'"0': -1, '"2': 0, '"4': 1}
        if fields[0] in polarity_by_code:
            # For any other leading field self.senti is left unset.
            self.senti = polarity_by_code[fields[0]]
        self.id = fields[1]
        self.date = fields[2]
        self.text = normalization(fields[5][:-1])
        self.tokens = tokenize(self.text)

        # Map Penn-style POS tags onto WordNet POS letters. The noun check is
        # independent of the adjective/adverb/verb chain.
        wordnet_tag = []
        for word, pos in nltk.pos_tag(self.tokens):
            if 'NN' in pos:
                wordnet_tag.append((word, 'n'))
            for marker, wordnet_pos in (('JJ', 'a'), ('RB', 'r'), ('VB', 'v')):
                if marker in pos:
                    wordnet_tag.append((word, wordnet_pos))
                    break

        # lemmatized tokens are lemmatized and lowered
        self.ltoken_tag = [
            (lemmatizer.lemmatize(word, wordnet_pos).lower(), wordnet_pos)
            for word, wordnet_pos in wordnet_tag
        ]

        self.tweet_senti_score = []
        for pair in self.ltoken_tag:
            score = sentiextractor.get_score(pair)
            self.tweet_senti_score.append(score if abs(score) > 0.02 else 0)

示例#49

0

显示文件

文件： twitter_monitor.py 项目： chengdujin/socrates

def analyze_tweets(tweets):
    """Keep the tweets tagged ``#socrates`` that were published today.

    A tweet is kept when ``#socrates`` is one of the tokens of its ``text``
    and ``orthodoxize_time(created_at, "%Y%m%d")`` equals today's local date.
    ``today`` is computed once per call so that every tweet is compared
    against the same date.

    Returns the list of matching tweet dicts in their original order.
    """
    import datetime

    # Register the bundled tweetmotif library once; this used to append a
    # duplicate entry to sys.path on every call.
    tweetmotif_path = "../libs/tweetmotif/"
    if tweetmotif_path not in sys.path:
        sys.path.append(tweetmotif_path)
    import twokenize

    first_candidates = [
        tweet for tweet in tweets
        if '#socrates' in twokenize.tokenize(tweet['text'])
    ]

    # filter out tweet not published today.
    today = datetime.date.today().strftime("%Y%m%d")
    return [
        candidate for candidate in first_candidates
        if orthodoxize_time(candidate['created_at'], "%Y%m%d") == today
    ]

示例#50

0

显示文件

文件： sgd.py 项目： lrei/twitter_annotator

def run_zmp(clf, port, preprocess=False, verbose=False):
    '''Serve classification requests over a ZMQ REP socket, forever.

    The socket is bound to ``tcp://*:<port>``. Each received message is
    optionally tokenized and preprocessed with ``twokenize``. The reply is
    ``str(default_class)`` for an empty message, otherwise the string form of
    the first prediction of ``clf`` for that message.
    '''
    ctx = zmq.Context()
    responder = ctx.socket(zmq.REP)
    address = 'tcp://*:' + str(port)
    responder.bind(address)
    if verbose:
        print('ZMQ Service Running: on %s' % (address,))

    while True:
        # Blocks until the next client request arrives.
        message = responder.recv()
        if preprocess:
            message = twokenize.preprocess(twokenize.tokenize(message))
        if message:
            reply = clf.predict([message])[0]
        else:
            reply = default_class
        responder.send(str(reply))

示例#51

0

显示文件

文件： tweetAnalysis.py 项目： alwayforver/demoBasic

    def __init__(self, text):
        """Normalize, tokenize and sentiment-score ``text``.

        The normalized text is tokenized and POS-tagged; noun, adjective,
        adverb and verb tokens are lemmatized and lowercased into
        ``ltoken_tag``. ``tweet_senti_score`` holds one score per entry:
        0 when the absolute score does not exceed 0.02, the negated score
        when one of the two preceding entries is in ``not_words``, and the
        plain score otherwise.
        """
        self.text = normalization(text)
        self.tokens = tokenize(self.text)

        # Map Penn-style POS tags onto WordNet POS letters. The noun check is
        # independent of the adjective/adverb/verb chain.
        wordnet_tag = []
        for word, pos in nltk.pos_tag(self.tokens):
            if 'NN' in pos:
                wordnet_tag.append((word, 'n'))
            for marker, wordnet_pos in (('JJ', 'a'), ('RB', 'r'), ('VB', 'v')):
                if marker in pos:
                    wordnet_tag.append((word, wordnet_pos))
                    break

        self.not_words = {
            "not", "don't", "dont", "didn't", "didnt", "doesn't", "doesnt",
            "no", "never", "isn't", "isnt", "cant", "can't", "cannot",
            "wasnt", "wasn't", "weren't", "werent", "couldn't",
        }

        # lemmatized tokens are lemmatized and lowered
        self.ltoken_tag = [
            (lemmatizer.lemmatize(word, wordnet_pos).lower(), wordnet_pos)
            for word, wordnet_pos in wordnet_tag
        ]

        self.tweet_senti_score = []
        for position, pair in enumerate(self.ltoken_tag):
            score = sentiextractor.get_score(pair)
            if not abs(score) > 0.02:
                self.tweet_senti_score.append(0)
                continue
            # Look at up to two entries immediately before the current one.
            window = self.ltoken_tag[max(position - 2, 0):position]
            negated = any(previous[0] in self.not_words for previous in window)
            self.tweet_senti_score.append(-score if negated else score)

示例#52

0

显示文件

文件： extractEntities2.py 项目： fatma-elsafoury/twitter-pos-geotagging

                entityMap[entity] = i
                i += 1

# Build the dictionary-name -> label table from the dict-label3 file.
# Each line must hold exactly two space-separated fields, otherwise the
# tuple unpacking below raises ValueError.
dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
        (dictionary, label) = line.rstrip('\n').split(' ')
        dict2label[dictionary] = label

nLines = 1
#tweet ='Accident - A4 Great West Rd about 100 M from Syon Lane near Gillette Cnr  - Road has been opened with lane restricted.'+'\n'+'Correction Accident - A4 Great West Rd at Syon Lane (Gillette Cnr) Road was closed w/b, which has been opened with a lane one restriction.' 
#tweet ='Shepherds Bush Green remains closed for gasworks from Holland Park RBT, diversions in place - expect peak period delays (espec. Holland Rd)\nShepherds Bush Green is closed towards King Street College due to roadworks between Holland Rd Roundabout and West 12 Shopping Centre\nR.I.P to the possum on Holland Rd that car crushed you.Kroger making a comeback the one on holland rd gon be nice as hell\n@TeaQ09 @Moe_Diesel_Baby  Whats going on in wagener on new holland rd????'
#tweet='The A4 Ellesmere Rd has reopened at Sutton Court Rd following the earlier collision. Residual Qs remain back to junction 2 on the M4'
# Read one tweet from stdin and tag it.
# NOTE(review): `line` is not reassigned anywhere in the visible part of this
# loop; presumably the next line is read further down -- confirm.
tweet =sys.stdin.readline().strip()
line = tweet.encode('utf-8')
while line:
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        # True when the capitalization classifier scores the tokens above 0.9.
        goodCap = capClassifier.Classify(words) > 0.9

        if posTagger:
                pos = posTagger.TagSentence(words)
                pos = [p.split(':')[0] for p in pos]  # remove weights
        else:
                pos = None

        # Chunking the tweet
        if posTagger and chunkTagger:
                # Pair every word with its POS tag (text before the first ':').
                word_pos = zip(words, [p.split(':')[0] for p in pos])
                chunk = chunkTagger.TagSentence(word_pos)

示例#53

0

显示文件

文件： tweets2entityWords.py 项目： 52nlp/twitter_nlp

# Base path used to build the two library directories added to sys.path below.
BASE_DIR = 'twitter_nlp.jar'

sys.path.append('%s/hbc/python' % (BASE_DIR))
sys.path.append('%s/python' % (BASE_DIR))

from LdaFeatures import LdaFeatures
from twokenize import tokenize

# Text of the previously processed row, used to drop consecutive duplicates.
prevText = None
for line in sys.stdin:
    # Each input row is tab-separated; the trailing fields are addressed from
    # the end of the row.
    line = line.rstrip('\n')
    fields = line.split('\t')

    sid    = fields[0]
    text   = fields[6]
    # NOTE(review): this token list is overwritten by the fields[-6] assignment
    # below before it is ever read.
    words  = tokenize(text)
    # Reciprocal of the last field; raises ZeroDivisionError when it is 0.
    confidence = 1.0 / float(fields[-1])
    eType  = fields[-2]
    entity = fields[-3]
    neTags = fields[-4].split(' ')
    pos    = fields[-5].split(' ')
    words  = fields[-6].split(' ')

    # Skip duplicate texts (they come from tweets with more than one entity)
    if prevText and prevText == text:
        continue
    prevText = text

    # NOTE(review): `options` is not defined in the visible part of the script;
    # presumably parsed command-line options -- confirm.
    features = LdaFeatures(words, neTags, windowSize=int(options.windowSize))
    for i in range(len(features.entities)):
        # Re-join the words covered by the i-th entity span [start, end).
        entity =  ' '.join(features.words[features.entities[i][0]:features.entities[i][1]])

示例#54

0

显示文件

文件： keyang.py 项目： JinyiLu/EventStructureLearning

def parseOneTweet(line):
    """Tag one tweet and return it as a space-joined string of annotated tokens.

    The tweet is tokenized, optionally POS-tagged, chunked and event-tagged
    (depending on which module-level taggers are set), sent to the ``ner``
    process for entity tags, and each detected entity is then typed through
    the ``llda`` process when it is available.

    Returns a string of ``word/tag`` items, each extended with ``/pos``,
    ``/chunk`` and ``/event`` when the corresponding annotations exist.
    """
    words = twokenize.tokenize(line)
    seq_features = []
    tags = []

    # True when the capitalization classifier scores the tokens above 0.9.
    goodCap = capClassifier.Classify(words) > 0.9

    if posTagger:
        pos = posTagger.TagSentence(words)
        #pos = [p.split(':')[0] for p in pos]  # remove weights
        # Strip only the last ':'-separated part, so tags containing ':' survive.
        pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
    else:
        pos = None

    # Chunking the tweet
    if posTagger and chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
        chunk = chunkTagger.TagSentence(word_pos)
        chunk = [c.split(':')[0] for c in chunk]  # remove weights
    else:
        chunk = None

    #Event tags
    if posTagger and eventTagger:
        events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
        events = [e.split(':')[0] for e in events]
    else:
        events = None

    # Build one feature string per token and send them to the NER process as
    # a single tab-separated, UTF-8 encoded line.
    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    # Read back exactly one output line (one tag) per token.
    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

    features = LdaFeatures(words, tags)

    #Extract and classify entities
    for i in range(len(features.entities)):
        # NOTE(review): `type` shadows the builtin and is never read in this function.
        type = None
        # IDs of the entity's feature words that the vocabulary knows.
        wids = [str(vocab.GetID(x.lower())) for x in features.features[i] if vocab.HasWord(x.lower())]
        if llda and len(wids) > 0:
            # "-1" is sent for entity strings missing from entityMap.
            entityid = "-1"
            # NOTE(review): has_key() exists only on Python 2 dicts.
            if entityMap.has_key(features.entityStrings[i].lower()):
                entityid = str(entityMap[features.entityStrings[i].lower()])
            labels = dictionaries.GetDictVector(features.entityStrings[i])

            # No dictionary matched: allow every label.
            if sum(labels) == 0:
                labels = [1 for x in labels]
            llda.stdin.write("\t".join([entityid, " ".join(wids), " ".join([str(x) for x in labels])]) + "\n")
            sample = llda.stdout.readline().rstrip('\n')
            # Drops the first 4 and last 8 characters of the reply before
            # splitting -- presumably a fixed wrapper around the sampled
            # dictionary ids; confirm against the llda output format.
            labels = [dict2label[dictMap[int(x)]] for x in sample[4:len(sample)-8].split(' ')]

            # Pick the most frequent sampled label.
            count = {}
            for label in labels:
                count[label] = count.get(label, 0.0) + 1.0
            maxL = None
            maxP = 0.0
            for label in count.keys():
                # NOTE(review): the divisor is the number of distinct labels,
                # not the number of samples; the winner is unaffected because
                # the divisor is the same for every label.
                p = count[label] / float(len(count))
                if p > maxP or maxL == None:
                    maxL = label
                    maxP = p

            # Rewrite the entity span as B-/I- tags of the chosen label, or
            # as "O" when the chosen label is the string 'None'.
            if maxL != 'None':
                tags[features.entities[i][0]] = "B-%s" % (maxL)
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "I-%s" % (maxL)
            else:
                tags[features.entities[i][0]] = "O"
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "O"
        else:
            # Without llda (or without known feature words) use a generic label.
            tags[features.entities[i][0]] = "B-ENTITY"
            for j in range(features.entities[i][0]+1,features.entities[i][1]):
                tags[j] = "I-ENTITY"

    # Assemble word/tag[/pos][/chunk][/event] for every token.
    output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
    if pos:
        output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
    if chunk:
        output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
    if events:
        output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
    return " ".join(output)

示例#55

0

显示文件

文件： twokenize_wrapper.py 项目： mattroweshow/NER-Diff-Paper

def tokenize(tweet):
    """Tokenize ``tweet`` with twokenize and split contractions.

    Before tokenizing, every literal backslash + ``n`` sequence is
    surrounded by spaces and every ``@`` gets a space in front of it.
    """
    padded = tweet.replace("\\n", " \\n ").replace("@", " @")
    return split_contractions(twokenize.tokenize(padded))

示例#56

0

显示文件

文件： tokenize2.py 项目： Qatar-Computing-Research-Institute/cqa-tokenizer

#!/usr/bin/env python
# Tokenize text using twokenize
# Francisco Guzman
import sys
sys.path.append('third-party')
import twokenize as tok



# Read lines from stdin, decode them as UTF-8 (undecodable bytes are
# ignored), tokenize them and print the space-joined tokens re-encoded as
# UTF-8. The parenthesized print behaves the same as the Python 2 print
# statement for a single argument.
for line in sys.stdin:
    tokenized = tok.tokenize(line.decode("utf-8", 'ignore'))
    print(u" ".join(tokenized).encode("utf-8", 'ignore'))

示例#57

0

显示文件

文件： geo_filter.py 项目： ezeissler90/CSI_431

def Geo_C(intput):
    """Return the coordinates of the first tweet in ``intput`` that passes all filters.

    Tweets are rejected when their source contains "Buoy", when the user has
    1000 or more followers or friends, when the text is blank, when no
    usable coordinates are found, when they are retweets, or when the screen
    name contains a tab. Coordinates come from the tweet's ``geo`` point or,
    failing that, from a "lat, lon" pattern in the user's location string.

    Returns the string ``"<lon>+<lat>"`` for the first accepted tweet. The
    ``return`` sits inside the loop, so later tweets are never examined;
    when no tweet is accepted the function falls through and returns None.
    """
    # A signed decimal number with 1-3 integer digits and at least 3 decimals.
    OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
    Separator = r', ?'
    LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

    for raw, tweet in iterate(raw=True, inputList=intput):
      source = lookup(tweet, 'source')
      if "Buoy" in source:
        # print "REJECT BUOY\t" + json.dumps(tweet)
        continue

      # Missing counts are treated as 0.
      n_fol = lookup(tweet, 'user.followers_count') or 0
      n_fri = lookup(tweet, 'user.friends_count') or 0
      if not (n_fol < 1000 and n_fri < 1000):
        # print "REJECT FOLLOWERS\t" + json.dumps(lookup(tweet,'user'))
        continue

      text = lookup(tweet, 'text')
      if not text.strip():
        # print "REJECT NO TEXT\t" + json.dumps(record)
        continue

      lat = None
      lon = None
      orig_str = ""

      loc_type = None

      geo = lookup(tweet, 'geo')
      if geo and geo['type'] == 'Point':
        # assumes geo['coordinates'] is ordered (lat, lon) -- TODO confirm
        lat, lon = geo['coordinates']
        loc_type = 'OFFICIAL'
      else:
        # Fall back to coordinates written in the free-text user location.
        loc = lookup(tweet, 'user.location').strip()
        if not loc:
          # print "REJECT NO USERLOC\t" + json.dumps(record)
          continue
        m = LatLong.search(loc.encode('utf8'))
        if not m:
          # print "REJECT NO GEO REGEX\t" + json.dumps(record)
          continue
        lat, lon = m.groups()
        loc_type = 'REGEX'

      # Reject (0, 0) and anything outside the valid latitude/longitude range.
      lat = float(lat); lon = float(lon)
      if (lat, lon) == (0, 0) or lat < -90 or lat > 90 or lon < -180 or lon > 180:
        # print "REJECT JUNK GEO\t" + json.dumps([lat,lon]) + "\t" + json.dumps(record)
        continue

      # # For our applications we usually want to kill retweets
      if lookup(tweet, 'retweeted_status'):
        # print "REJECT OFFICIAL RT\t" + json.dumps(text)
        continue
      # Also drop manual retweets, i.e. texts with a standalone 'RT' token.
      toks = twokenize.tokenize(text)
      if any(tok == 'RT' for tok in toks):
        # print "REJECT TEXT RT\t" + json.dumps(text)
        continue

      # Build a "SmallTweet" format record
      record = {
          'id': lookup(tweet, 'id'),
          'user': lookup(tweet, 'user.screen_name'),
          'date': tweet['created_at_datetime'].strftime("%Y-%m-%dT%H:%M:%S"),
          'text': lookup(tweet, 'text')
      }

      # Stored longitude first, then latitude.
      record['lonlat'] = [lon, lat]

      # A tab in the screen name would break the tab-joined output.
      if '\t' in record['user']:
        print >> sys.stderr, "WTF\t" + json.dumps(record)
        continue



      # Only the "lon+lat" field is active; the other fields are commented out.
      out = [
          # 'GEO ' + loc_type,
#          str(record['id']),
#          record['user'].encode('utf-8'),
#          record['date'].encode('utf-8'),
          str(record['lonlat'][0]) + '+' + str(record['lonlat'][1])
#          record['text'].encode('utf-8')
          # json.dumps(lookup(tweet, 'user.location')),
          # json.dumps(lookup(tweet, 'source')),
          # json.dumps(record),
      ]

#      TempVar = record

#      print '\t'.join(out)

      # Returns on the first accepted tweet; the loop does not continue.
      return '\t'.join(out)

示例#58

0

显示文件

文件： tweetment.py 项目： itisha07/Twitter-Sentiment-Analysis

 def _tokenize(self, tweet):
   """Return the tokens ``twokenize.tokenize`` produces for ``tweet``."""
   return twokenize.tokenize(tweet)