def clean_tokenize_stem_tweets_baseline(raw_tweets, token_file, tagging_file, force_new=False):
    if os.path.isfile(token_file) and os.path.isfile(tagging_file) and not force_new:
        print('reading tokens and tagging results')
        with open(token_file, 'rb') as inf:
            token_lists = pickle.load(inf)
        with open(tagging_file, 'rb') as inf:
            tagging_results = pickle.load(inf)
        if len(token_lists) == raw_tweets.shape[0] and len(tagging_results) == raw_tweets.shape[0]:
            print('tokens and tagging results are correct')
            return token_lists, tagging_results
    print('perform new tokenization and tagging')
    # or create new one
    stemmer = PorterStemmer()
    url_compiled = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
    mention_compiled = re.compile(r'@\w*')
    html_compiled = re.compile(r"&#?\w+;")
    tweets_cleaned = [html_compiled.sub(' ', tweet) for tweet in raw_tweets]
    tweets_cleaned = [url_compiled.sub(' ', tweet) for tweet in tweets_cleaned]
    tweets_cleaned = [mention_compiled.sub(' ', tweet) for tweet in tweets_cleaned]
    tagging_results = CMUTweetTagger.runtagger_parse(tweets_cleaned)
    token_lists = [[stemmer.stem(word_result[0]) for word_result in tweet_result] for tweet_result in tagging_results]
    # save to file
    with open(token_file, 'wb') as outf:
        pickle.dump(token_lists, outf)
    with open(tagging_file, 'wb') as outf:
        pickle.dump(tagging_results, outf)
    return token_lists, tagging_results
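A minimal usage sketch for the helper above (illustrative, not from the original source). It assumes the snippet's implied imports (os, re, pickle, nltk.stem.PorterStemmer and the CMUTweetTagger wrapper) and a pandas Series of raw tweet texts, since the function checks raw_tweets.shape[0]:

# Hypothetical call; `df` and the pickle paths are placeholders.
tokens, tags = clean_tokenize_stem_tweets_baseline(
    df["text"], token_file="tokens.pkl", tagging_file="tags.pkl")
# `tokens` holds one stemmed token list per tweet; `tags` keeps the full
# (word, POS, confidence) triples returned by the ARK tagger.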
Example #2
def boost_df(X):
    # `vectorizer` is assumed to be a fitted sklearn vectorizer from the
    # enclosing scope.
    boost_entity = {}
    dfVoc = {}
    boosted_wdfVoc = {}
    Xdense = np.matrix(X).astype('float')
    X_scaled = preprocessing.scale(Xdense)
    X_normalized = preprocessing.normalize(X_scaled, norm='l2')
    vocX = vectorizer.get_feature_names()

    pos_tokens = CMUTweetTagger.runtagger_parse(
        [term.upper().encode() for term in vocX])

    # boost terms that the ARK tagger marks as proper nouns ('^')
    for l in pos_tokens:
        term = ''
        for gr in range(0, len(l)):
            term += l[gr][0].lower() + " "
        if "^" in str(l):
            boost_entity[term.strip()] = 2.5
        else:
            boost_entity[term.strip()] = 1.0

    # document frequency per vocabulary term, weighted by the entity boost
    dfX = np.asarray(X.sum(axis=0)).flatten()
    for k, v in zip(vocX, dfX):
        dfVoc[k] = v
    for k in dfVoc:
        boosted_wdfVoc[k] = dfVoc[k] * boost_entity.get(k, 1.0)

    return X_normalized, boosted_wdfVoc
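A hedged usage sketch (not from the original repo), assuming an older scikit-learn where get_feature_names() still exists and assuming vectorizer is left as a module-level global so boost_df can read it:

from sklearn.feature_extraction.text import CountVectorizer

# Illustrative only: boost_df expects `vectorizer` in the enclosing scope.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(["obama visits berlin", "heavy rain in berlin"]).toarray()

X_norm, boosted = boost_df(X)
# Terms the ARK tagger marks as proper nouns ('^') get a 2.5x boost on their
# document frequency; all other terms keep a 1.0 weight.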
Example #3
    def pos_tagger(self):
        tweets = []
        for tw in self.tweet_original:
            try:
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            except:
                tw = re.sub(r'\\+', '', tw)
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            tweets.append(tw)

        # tweets = [tw.encode('utf8') for tw in self.tweet_original[:3]]
        sent_tags = CMUTweetTagger.runtagger_parse(tweets)
        # fil_tweet = open('tweet_tags.json','w')
        i = 0
        for sent in sent_tags:
            unigrams = [tag_tuple[1] for tag_tuple in sent]
            bigrams = set(nltk.bigrams(unigrams))
            trigrams = set(nltk.trigrams(unigrams))
            self.tweet_unigram[self.tweet_id[i]] = set(unigrams)
            self.tweet_bigram[self.tweet_id[i]] = bigrams
            self.tweet_trigram[self.tweet_id[i]] = trigrams

            self.tweet_feature_list.extend(unigrams)
            self.tweet_feature_list.extend(bigrams)
            self.tweet_feature_list.extend(trigrams)

            i += 1
        #json.dump(self.tweet_unigram,fil_tweet)
        self.tweet_feature_list = list(set(self.tweet_feature_list))
Example #4
def genPOSTags(text_path, pos_path, pos_corpus_file, verbose=True):
    with open(os.path.join(param.dump_folder, text_path), "rb") as handle:
        comment_list, _ = pickle.load(handle)
    # tokenize
    #comment_list = genTokens(comment_list)
    # POS tagging
    pos_comment_list = []
    max_sent_num = 5000
    ind = 0
    while (ind < len(comment_list)):
        sent_list = comment_list[ind:ind + max_sent_num]
        #[" ".join(seq) for seq in comment_list[ind: ind+max_sent_num]]
        tok_sent_list = [
            sent.lower()
            for sent in CMUTweetTokenizer.runtokenizer_parse(sent_list)
        ]
        raw_pos_list = CMUTweetTagger.runtagger_parse(tok_sent_list)
        pos_list = []
        for raw_seq in raw_pos_list:
            seq = [tup[1] for tup in raw_seq]
            pos_list.append(seq[:])
        pos_comment_list = pos_comment_list + pos_list
        ind += max_sent_num
    with open(os.path.join(param.dump_folder, pos_path), "wb") as handle:
        pickle.dump(pos_comment_list, handle)
    if text_path.endswith("comm.data"):
        pos_corpus_file.write("\n".join(
            [" ".join(pos_comm) for pos_comm in pos_comment_list]))
        pos_corpus_file.write("\n")
    if verbose:
        print("# pos sequences:", len(pos_comment_list))
        for i in range(3):
            print("example of pos sequence:", pos_comment_list[i])
        print("save pos to {}".format(pos_path))
Example #5
def update_edges_tag(database):
    tweets = [u"someone is cold game nd he needs to follow me",
          u"only 3mths left in school . i wil always mis my skull , frnds and my teachrs"]


    lot = CMUTweetTagger.runtagger_parse(tweets)

    norm = normalizer.Normalizer(lot,database)
#    tags = norm.nodes.distinct('tag')
    tags = [u'A',
            u'N',
            u'^',
            u'V',
            u'!',
            u'O',
            u'G',
            u'S',
            u'R',
            u',',
            u'P',
            u'Z',
            u'L',
            u'D',
            u'&',
            u'T',
            u'X',
            u'Y',
            u'M']
    for tag in tags:
        nouns = [node['node'] for node in filter(lambda x: x['freq']> 8, norm.nodes.find({'tag':tag}))]
        norm.edges.update({'from': { '$in' : nouns}},{'$set' : {u'from_tag':tag } },multi=True)
        norm.edges.update({'to': { '$in' : nouns}},{'$set' : {u'to_tag':tag } },multi=True)
Example #6
    def parse_words_by_ark_nlp_batch(self, tweets, preserve_types, ark_run_cmd):
        token_lists = CMUTweetTagger.runtagger_parse(tweets, run_tagger_cmd=ark_run_cmd)
        ret = []
        for tokens in token_lists:
            filtered_tokens = self.filter_by_type(tokens, preserve_types)
            words = self.parse_tokenized_words(filtered_tokens)
            ret.append(words)
        return ret
Example #7
def pos_tag_tweets(tweet_list, pkl_file='pickle/pos_tweets.pkl'):

	tagged_tweets = CMUTweetTagger.runtagger_parse(tweet_list, run_tagger_cmd=tweet_stack_command)

	if pkl_file is not None:
		serialize_object(tagged_tweets, pkl_file)

	return tagged_tweets
Example #8
def boost_entities(features):
    boost_entity = {}
    pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in features])

    # boost a feature if the ARK tagger marks any of its tokens as a proper noun ('^')
    for line in pos_tokens:
        term = ''
        for entity in range(len(line)):
            term += line[entity][0].lower() + " "
        if "^" in str(line):
            boost_entity[term.strip()] = 2.5
        else:
            boost_entity[term.strip()] = 1.0
    return boost_entity
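A hedged illustration of the intended result, assuming the ARK tagger marks "OBAMA" as a proper noun ('^') and "rain" as a common noun:

# boost_entities(["obama", "rain"]) would yield roughly
# {"obama": 2.5, "rain": 1.0}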
Example #9
def tag_tweets(tweets):
    tagResults = CMUTweetTagger.runtagger_parse(tweets)
    result = []
    for tweet in tagResults: #w = word, t = tag, c = confidence level
        tuplesList = []
        for triple in tweet:
            (w, t, c) = triple
            #removing urls, user mentions, numbers, and hashtags from the tweet
            if t != 'U' and t != '@' and t!= '$' and t != '#':
                tuplesList.append((w,t))
        result.append(tuplesList)
        
    return result
Example #10
def tag_tweets(tweets):
    tagResults = CMUTweetTagger.runtagger_parse(tweets)
    result = []
    for tweet in tagResults:  #w = word, t = tag, c = confidence level
        tuplesList = []
        for triple in tweet:
            (w, t, c) = triple
            #removing urls, user mentions, numbers, and hashtags from the tweet
            if t != 'U' and t != '@' and t != '$' and t != '#':
                tuplesList.append((w, t))
        result.append(tuplesList)

    return result
Example #11
def posTagged(tweets):

    taggedData = CMUTweetTagger.runtagger_parse(tweets)
    posTagsPerTweet = []
    for tweet in taggedData:
        sumConf = 0.0
        count = 0
        for tokenTup in tweet:
            sumConf = sumConf + tokenTup[2]
            count = count + 1
        avgConf = 0 if count == 0 else (sumConf/count)
        posTagsPerTweet.append((tweet, avgConf))
    return posTagsPerTweet
Example #12
def run(tweets, slang, not_oov, threshold=1.5, slang_threshold=1,
        max_val = [1., 1., 0.5, 0.0, 1.0, 0.5], distance = 2, oov_fun = ovvFunc):
    pos_tagged = CMUTweetTagger.runtagger_parse(tweets)
    window_size = 7
    matrix1 = calc_score_matrix(pos_tagged, oov_fun, window_size,database='tweets2')
    if not slang:
        slang = tools.get_slangs()
    fms = add_slangs(matrix1,slang)
    fmd = add_from_dict(fms, matrix1, distance, not_oov)
    mapp = construct_mapp(pos_tagged,oov_fun)
    fm_reduced = add_nom_verbs(fmd,mapp ,slang_threshold=slang_threshold)
    feat_mat = iter_calc_lev(matrix1,fm_reduced, not_ovv = not_oov)
    res = calc_results(feat_mat, not_oov, max_val = max_val, threshold = threshold)
    return res
Example #13
    def resolve(self, original):

        #print 'resolve length: ', len(original)

        data = [self.normalizeKey(twt) for twt in set(original)]

        if enabled_modules['caches'] is not None:
            # Tag all uncached data
            uncached = [twt for twt in data if not self.cache.has_key(twt)]
        else:
            uncached = data

        #print uncached
        #print 'len     : ', len(uncached)
        #print 'uncached: '
        #for twt in uncached: print '\t', twt
        #print '\n\n\n'

        partial = []
        if uncached:
            print 'uncached: ', len(uncached)
            partial = CMUTweetTagger.runtagger_parse(
                uncached, run_tagger_cmd=run_tagger_cmd)
            print 'partial: ', len(partial)

            if enabled_modules['caches'] is not None:
                for twt, tag in zip(uncached, partial):
                    #print 'adding: ', twt
                    self.cache.add_map(twt, tag)

        # Lookup all tags
        if enabled_modules['caches'] is not None:
            tagged = [self.cache.get_map(twt) for twt in data]
        else:
            tagged = partial

        #print 'TAGGED DATA'
        #print tagged

        # Store the data in the object
        self._toks = {}
        self._pos = {}
        for twt, tags in zip(data, tagged):

            # Last step of splitting compound words
            newToks, newTags = self.post_process_tokenize(tags)

            self._toks[twt] = newToks
            self._pos[twt] = newTags
Example #14
def checkTweetNums(tweets, minTweets):
    #number as adjective check
    count = 0
    processedtweets = []
    for line in tweets:
        processedtweets.append(" ".join(wordsegment.segment(line)))
    postags = cmu.runtagger_parse(processedtweets)
    for postag in postags:
        postag = "".join(postag)
        if "$N" in postag or "$^" in postag or "$M" in postag or "$Z" in postag:
            #Checking for Consecutive numbers and Nouns
            count += 1
    if count >= minTweets:
        return 1
    else:
        return 0
Example #15
def checkTweetNums(tweets,minTweets):
	#number as adjective check
	count = 0
	processedtweets = []
	for line in tweets:
		processedtweets.append(" ".join(wordsegment.segment(line)))
	postags = cmu.runtagger_parse(processedtweets)
	for postag in postags:
		postag = "".join(postag)
		if "$N" in postag or "$^" in postag or "$M" in postag or "$Z" in postag:
			#Checking for Consecutive numbers and Nouns
			count += 1
	if count >= minTweets:
		return 1
	else:
		return 0
Example #16
def process_comments(target_list, post_list):
    hits = []
    for comment in post_list:
        hit_ents = []
        content = [comment['content']]
        ents = CMUTweetTagger.runtagger_parse(content)[0]
        for ent in ents:
            enty = ent[0]
            if enty in target_list:
                hit_ents.append(enty)

        if len(hit_ents) > 0:
            comment['hit_ents'] = hit_ents
            hits.append(comment)

    return hits
Example #17
    def __tag(self, cleanedTweets):
        """Tag the cleaned tweets.

        Arguments:
            cleanedTweets {list} -- a list of cleaned tweets

        Returns:
            list -- [[(form, pos, score), ...], [(form, pos, score), ...], ...]
        """
        print("The number of tweets before tagged {}".format(
            len(cleanedTweets)))
        taggedTweets = CMUTweetTagger.runtagger_parse(cleanedTweets)
        print("The number of tweets after tagged {}".format(len(taggedTweets)))
        self.helper.dumpJson(self.fileFolderPath, "tagged_tweets.json",
                             taggedTweets)
        print("tagged_tweets.json has been saved.")
        return taggedTweets
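A hedged illustration of the return structure described in the docstring above (the scores are made up):

# self.__tag(["heavy rain in berlin"]) would return something like
# [[('heavy', 'A', 0.99), ('rain', 'N', 0.99), ('in', 'P', 0.99), ('berlin', '^', 0.98)]]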
Example #18
def produce_entity_list(data):
    output_dict = {}
    for i in data.index:
        tweet = [data.loc[i, 'content']]
        ents = CMUTweetTagger.runtagger_parse(tweet)[0]
        for ent in ents:
            if ent[0] in output_dict.keys():
                output_dict[ent[0]]['count']+=1
            else:
                if ent[1] in ['N', '^', 'S', 'Z', 'M', 'A']:
                    enty = ent[0]
                    pos = ent[1]
                    output_dict[enty] = {'pos': pos, 'count': 1}

    output_df = pd.DataFrame.from_dict(output_dict, orient = 'index')
    output_df['entity'] = output_df.index
    output_df.index = range(len(output_df))
    return output_df, output_dict
Example #19
def get_tense_vectors(sents):

    # VBD/VBN/VBP/VBG/VBZ/MD below are Penn Treebank tags, so this snippet
    # presumably runs the ARK tagger with its PTB-style model rather than the
    # default coarse tagset.
    tagged_sents = cmu.runtagger_parse(sents)
    feat_vecs = []  # one [past, present, future] count vector per sentence
    for tagged_sent in tagged_sents:
        feat_vec = []
        # past
        feat_vec.append(
            len([word for word in tagged_sent if word[1] in ['VBD', 'VBN']]))
        # present
        feat_vec.append(
            len([
                word for word in tagged_sent
                if (word[1] in ['VBP', 'VBG', 'VBZ'] or word[0] == 'now')
            ]))
        # future
        feat_vec.append(len([word for word in tagged_sent if word[1] == 'MD']))
        feat_vecs.append(feat_vec)
    return feat_vecs
Example #20
def annotate_pos(tweets, ptb=False):
    if ptb:
        tagger_cmd = RUN_TAGGER_CMD_PTB
    else:
        tagger_cmd = RUN_TAGGER_CMD
    ids = []
    texts = []
    for key, value in tweets.items():
        ids.append(key)
        texts.append(json.dumps({'text':value['text']}))
    pos = CMUTweetTagger.runtagger_parse(texts, run_tagger_cmd=tagger_cmd)
    
    if len(ids) != len(pos):
        raise Exception("Error: Tweet Tagger returned incorrect results") 

    for i in range(0, len(ids)):
        tweets[ids[i]]['pos'] = pos[i]
        tweets[ids[i]]['tokens'] = [tag[0] for tag in pos[i]]

    print pos[0]
    print [tag[0] for tag in pos[0]]
Example #21
    def resolve(self, original):

        #print 'resolve length: ', len(original)

        data = [self.normalizeKey(twt) for twt in set(original)]

        # Tag all uncached data
        uncached = [ twt for twt in data if not self.cache.has_key(twt) ]

        #print uncached
        #print 'len     : ', len(uncached)
        #print 'uncached: '
        #for twt in uncached: print '\t', twt
        #print '\n\n\n'

        if uncached:
            print 'uncached: ', len(uncached)
            partial = CMUTweetTagger.runtagger_parse(uncached)
            print 'partial: ', len(partial)
            for twt,tag in zip(uncached,partial):
                #print 'adding: ', twt
                self.cache.add_map(twt, tag)

        # Lookup all tags
        tagged = [ self.cache.get_map(twt) for twt in data ]

        #print 'TAGGED DATA'
        #print tagged

        # Store the data in the object
        self._toks = {}
        self._pos  = {}
        for twt,tags in zip(data,tagged):

            # Last step of splitting compound words
            newToks,newTags = self.post_process_tokenize(tags)

            self._toks[twt] = newToks
            self._pos[twt]  = newTags
Example #22
    def pos_tagger_writer(self):
        tweets = []
        for tw in self.tweet_original:
            try:
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            except:
                tw = re.sub(r'\\+', '', tw)
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            tweets.append(tw)

        # tweets = [tw.encode('utf8') for tw in self.tweet_original[:3]]
        tweet_tags = {}
        sent_tags = CMUTweetTagger.runtagger_parse(tweets)
        fil_tweet = open(self.tagFile,'w')
        i = 0
        for sent in sent_tags:
            is_question = 1 if self.is_question[i] == "yes" else 0
            is_answerable = 1 if self.is_answerable[i] == "yes" else 0
            pos = {"tags": sent, "is_question": is_question, "is_answerable": is_answerable}
            tweet_tags[self.tweet_id[i]] = pos
            i += 1
        json.dump(tweet_tags, fil_tweet)
        fil_tweet.close()
Example #23
import nltk
import CMUTweetTagger

ex1 = "Before what happened at lunch when someone decided to piss me off.I had so much at the diabetes walk. Especially with my girls #TWERKTEAM!"
#text1 = nltk.word_tokenize(ex1)
#print nltk.pos_tag(text1)

ex2 = "my mum got me a tofee apple company.. Diabetes is definately tryna proceed to me LOL"
#text2 = nltk.word_tokenize(ex2)
#print nltk.pos_tag(text2)

print CMUTweetTagger.runtagger_parse([
    "my mum got me a tofee apple company.. Diabetes is definately tryna proceed to me LOL"
])
Example #24
    def transform(self, X):
        return CMUTweetTagger.runtagger_parse(X)
Example #25
tofile = open(argv[3],"w") #file to take output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()

sociallists = [] # to take hashtags in a list

for line in idiomsEx:
	sociallists.append(line.replace("\n",""))

parsedSociallists = [] #parse the hashtags using str2num library and add them as a list

for line in sociallists:
	parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))

postags = cmu.runtagger_parse(parsedSociallists) #gets a list of postags each for each hashtag

i = 0

for ParsedTag,postag,type in zip(parsedSociallists,postags,list_type):
	checkTweetsret = checkTweets.checkTweets(ParsedTag,"test/"+str(i/100)+"tweets.txt")
	#checks for the hashtag in the files provided.

	i+=1

	tofile = open(argv[3],"a")
	tofile.write(str(testFile1.test1(ParsedTag))+","+ #number of characters in hashtag
	str(testFile2.test2(ParsedTag))+","+ #number of words in hashtag
	str(testFile4.test4(ParsedTag))+","+ #presence of days
	str(testFile5.numbercount(postag))+","+ # presence of numbers
	str(testFile5.prepositioncount(postag))+","+ #presence of prepositions
Example #26
def DataPreprocessing(data, train=1):

    global docCount

    #EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)

        #OPTIONAL SPELLING CORRECTION
        #data.loc[docCount,"tweet"]=str(blob.correct())
        #print(data.loc[docCount,"tweet"],type(data.loc[docCount,"tweet"]))

        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    #INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    #POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn",
        "O": "pro",
        "S": "np",
        "^": "nnps",
        "Z": "nnpz",
        "L": "vl",
        "M": "nv",
        "V": "md",
        "A": "adj",
        "R": "adv",
        "!": "int",
        "D": "det",
        "P": "ppt",
        "&": "cc",
        "T": "rp",
        "X": "ex",
        "Y": "exv",
        "#": "cat",
        "@": "tar",
        "~": "dsc",
        ",": "punc",
        "$": "num",
        "U": "url",
        "E": "emo",
        "G": "abr"
    }

    #PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(
                    word[0]))  #+'_'+POSDictionary[word[1]])
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    #REPLACING LABEL (subtask) WITH INTEGER
    if (train == 1):
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    #SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
Example #27
@Licence :
	This work is licensed under the
	Creative Commons Attribution-NonCommercial-ShareAlike 4.0
	International License. To view a copy of this license,
	visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
'''

import CMUTweetTagger as cmu
import wordsegment as ws

file1 = open()  # input file paths were elided in the original snippet
file2 = open()

data1 = file1.read()
data2 = file2.read()

tweets1 = data1.split("\n\n")

hashtags = []

for tweet1 in tweets1:
	hashtag = tweet1.split("\n")[0]
	hashtags.append(" ".join(ws.segment(hashtag)))

postags = cmu.runtagger_parse(hashtags)

i=0

for postag in postags:
	if '$' in "".join(postag):
		i+=1
Example #28
file = open(argv[1]) #file containing socialList and nonsocialList hashtags
tofile = open(argv[2], "w") #file that takes the arff output
tofile.close()
idiomsEx = file.readlines()
sociallists = []

for line in idiomsEx:
	sociallists.append(line.replace("\n", ""))

parsedSociallists = []

for line in sociallists:
	parsedSociallists.append(" ".join(ws.segment(line)))

postags = cmu.runtagger_parse(parsedSociallists)

'''
file output would be in the format of popularity,precision at 10,precision at 20 in each line for every hashtag

This takes a lot of time to run.
'''
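A hedged illustration of one output line (the numbers are made up), assuming testFile14.test14 returns the (popularity, precision at 10, precision at 20) triple used in the loop below:

# for one hashtag the script would append a line such as
# 1250000,0.7,0.65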

for ParsedTag, postag in zip(parsedSociallists, postags):
	tofile = open(argv[2], "a")
	a = testFile14.test14(ParsedTag, postag)
	#checks the hashtag in google and returns list of its popularity precision at 10 urls and 20 urls
	print str(a[0]) + "," + str(a[1]) + "," + str(a[2])
	tofile.write(str(a[0]) + "," + str(a[1]) + "," + str(a[2]) + "\n")
	tofile.close()
Example #29
file = open('../socialList.txt')
idiomsEx = file.readlines()
arr = []
i = 0

idioms = []

strlength = 0
word = 0
for x in idiomsEx:
    a = segment(x.replace("\n",""))
    strlength += len(x.replace("\n",""))
    idioms.append(" ".join(a))
    word = word+len(a)

postags = cmu.runtagger_parse(idioms)

count = len(postags)
nouns = 0
pronouns = 0
conjunctions = 0
interjections = 0
verbs = 0
adjectives = 0
adverbs = 0
prepositions = 0

for x in postags:
    tagscount = Counter(x)
    nouns += tagscount['N']+tagscount['^']
    pronouns += tagscount['O']
Example #30
            for clfreq in freqTwCl.most_common(50):
                cl = clfreq[0]
                freq = clfreq[1]
                cluster_score[cl] = 0
                if freq >= freq_th:
                    #print "\n(cluster, freq):", clfreq
                    clidx = (npindL == cl).nonzero()[0].tolist()
                    cluster_centroid = X[clidx].sum(axis=0)
                    #print "centroid_array:", cluster_centroid
                    try:
                        # cluster_tweet = vectorizer.inverse_transform(cluster_centroid)
                        #get the words closest to center
                        sim_word_list = model.most_similar(
                            positive=[cluster_centroid], topn=20)

                        pos_tokens = CMUTweetTagger.runtagger_parse(
                            [term[0].upper() for term in sim_word_list])
                        #print "detect entities", pos_tokens
                        score = 0
                        for l in pos_tokens:
                            term = ''

                            for gr in range(0, len(l)):
                                term += l[gr][0].lower() + " "
                            if "^" in str(l):
                                score += 2.5
                            else:
                                score += 1.0

                        cluster_score[cl] = score

                    except:
                        pass
Example #31
            print "Xclean.shape:", Xclean.shape
            #print map_index_after_cleaning
            #play with scaling of X
            X = Xclean
            Xdense = np.matrix(X).astype('float')
            X_scaled = preprocessing.scale(Xdense)
            X_normalized = preprocessing.normalize(X_scaled, norm='l2')
            #transpose X to get features on the rows
            #Xt = X_scaled.T
            # 				#print "Xt.shape:", Xt.shape
            vocX = vectorizer.get_feature_names()
            #print "Vocabulary (tweets):", vocX
            #sys.exit()

            boost_entity = {}
            pos_tokens = CMUTweetTagger.runtagger_parse(
                [term.upper() for term in vocX])
            #print "detect entities", pos_tokens
            for l in pos_tokens:
                term = ''
                for gr in range(0, len(l)):
                    term += l[gr][0].lower() + " "
                if "^" in str(l):
                    boost_entity[term.strip()] = 2.5
                else:
                    boost_entity[term.strip()] = 1.0
# 				print "boost_entity",  sorted( ((v,k) for k,v in boost_entity.iteritems()), reverse=True)

#  				boost_term_in_article = {}
#  				for term in vocX:
#   					if term in vocA:
#  						#print "boost term in article:", term, vocA
Example #32
def create_resource_list(global_need_resource_list, need_text):
    count = 0
    for text in need_text:
        #output_test_file.write(str(count+1)+": "+text+"\n")
        source_list_3 = []

        urls = re.findall(web_url, text)
        for i in urls:
            if len(i) > len('http://t.co'):
                source_list_3.append(i)

        text2 = tweet_preprocess(text)
        need_cmu_tags = CMUTweetTagger.runtagger_parse([text2])

        text = tweet_preprocess2(text)
        quantity_dict = {}
        final_resource_keys = []
        source_list = []
        loc_list = []
        poss_places = []
        org_person_list = []
        quantity_dict, final_resource_keys, source_list, poss_places, org_person_list = common_nouns.get_resource(
            text)

        for i in source_list_3:
            source_list.append(i)

        # print(count)
        print(text)
        doc = nlp(text)
        #need_tag.append(CMUTweetTagger.runtagger_parse([text]))

        loc_list = proper_noun.give_location(need_cmu_tags)

        for i in org_person_list:
            if i in loc_list:
                try:
                    loc_list.remove(i)
                except:
                    continue
            if i not in source_list:
                source_list.append(i)

        for i in loc_list:
            if i in source_list:
                try:
                    source_list.remove(i)
                except:
                    continue

        for i in poss_places:
            if i not in loc_list:  #and location.is_inside_Nepal(i)==1:
                loc_list.append(i)

        for i in org_person_list:
            if i in final_resource_keys:
                try:
                    final_resource_keys.remove(i)
                except:
                    continue

        count = count + 1
        final_resource_lists = []
        for key in final_resource_keys:
            if key in quantity_dict:
                final_resource_lists.append(key.split(' ')[-1])
                continue
            if key in text:
                final_resource_lists.append(key)

        post_preprocess(text, global_need_resource_list, final_resource_lists,
                        quantity_dict, loc_list, source_list)
Example #33
def getTweets(query):
    #pp = pprint.PrettyPrinter(indent=4)

    try:
        if not sample:
            tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
            print(query)
            tso.set_keywords([
                query
            ])  # let's define all words we would like to have a look for
            tso.set_language('en')  # we want to see English tweets only
            tso.set_include_entities(
                False
            )  # and don't give us all those entity information   #from original code, idk what it does
            #my API data.
            ts = TwitterSearch(
                consumer_key='SwtLcZe9Im6q998K4cJqANs4n',
                consumer_secret=
                '7PMRM3ec7ltINPVl72FXurMn8Qg9HrS1NKwocYJVlTGngEFbEA',
                access_token=
                '51466054-cJUBESD4H9THIQExiKQ1HOGdR0GflXdyeIeL0TfKw',
                access_token_secret=
                'nn3ESWtluVoLSNFexAKcEesF6rEg0lTJ4QaIbFHJACFDr')

        count = 1000  #how many tweets we want to see. we want as many as possible, but do not want to sacrifice load time too much
        i = 0
        tweet_list = []
        if sample:
            print("Reading Sample File")
            for line in file.read().split('\n'):
                tweet_list.append(line)
        else:
            print("Searching....")
            for tweet in ts.search_tweets_iterable(tso):

                if i >= count:
                    break  #stops getting tweets when we have enough

                #keep this line below as a reference. from the original code:
                #print( '@%s tweeted: %s' % ( tweet['user']['screen_name'], tweet['text'] ) )

                words = tweet['text']
                start = re.search(r"(((RT )?@(\w)*) ?:? )?", words)
                # slice off the matched "RT @user :" prefix; str.lstrip() would
                # strip characters, not the matched prefix
                words = words[start.end():]
                tweet_list.append(words)
                i += 1
            # if we have less than 1000 tweets, the corpus is too short.
            if (len(tweet_list) < 1000):
                print(
                    "Sorry! Your search did not return enough results, please try another."
                )
                return
            print("Search complete!")
        print("Tagging...")

        tagged = CMU.runtagger_parse(sent_tokenize(
            "\n".join(tweet_list)))  #tweetset))
        print("Tagging complete!")
        print("Analyzing tags...")

        tag_table = Process.create_rules(tagged)
        syl_rules = Process.get_pos_syllables(tagged)
        rhyme_pos_table = SCD.rhyme_to_POS(tagged)
        print("Analysis Complete!")
        print("Generating poetry...")
        result1 = Process.generate_firsttwo(tag_table, syl_rules)
        r1 = result1[1]
        r2 = result1[2]
        firsttwo = result1[0]
        result2 = Process.generate_lasttwo(tag_table, syl_rules,
                                           rhyme_pos_table, r1, r2)
        lasttwo = result2
        print("A poem about " + query + ":")
        print()
        print(firsttwo)
        print(lasttwo)

    except TwitterSearchException as e:  # take care of all those ugly errors if there are some
        print(e)
Example #34
sys.path.append(os.path.join(CMU_TWEET, "ark_tweet_nlp_python"))

import CMUTweetTagger

inp = sys.argv[1]
DATA = sys.argv[2]

tweets = []
with open(os.path.join(DATA, inp), "r") as f:
    for i, line in enumerate(f):
        fields = line.rstrip("\n").split("\t")
        tweets.append(fields[3].decode("utf-8"))

tweets_parsed = CMUTweetTagger.runtagger_parse(
    tweets,
    run_tagger_cmd="java -XX:ParallelGCThreads=2 -Xmx500m -jar " +
    os.path.join(CMU_TWEET, "ark-tweet-nlp-0.3.2.jar"))
data = []

with io.open(os.path.join(DATA, inp + ".proc"), "w", encoding="utf-8") as w:
    with open(os.path.join(DATA, inp), "r") as f:
        for i, line in enumerate(f):
            fields = line.rstrip("\n").split("\t")
            instance = {}
            instance["tweetid"] = fields[0]
            instance["userid"] = fields[1]
            instance["sentiment"] = fields[2]
            instance["tweet"] = [
                e[0].decode("utf-8") for e in tweets_parsed[i]
            ]
            instance["pos"] = [e[1] for e in tweets_parsed[i]]
Example #35
lemmatize_text(cleaned_text[5])

"""
Fun with the CMU tagger

https://github.com/brendano/ark-tweet-nlp
http://www.ark.cs.cmu.edu/TweetNLP/
https://github.com/ianozsvald/ark-tweet-nlp-python
"""

import CMUTweetTagger # will wrap this with a web-service

for text in cleaned_text[:10]:
    print text
    print CMUTweetTagger.runtagger_parse([text])
    print


#output:
#AT_USER $aapl. apple's iphone has cracked.
#[[('AT_USER', 'P', 0.5752), ('$aapl', '^', 0.7174), ('.', ',', 0.9668), ("apple's", 'Z', 0.6764), ('iphone', '^', 0.7309), ('has', 'V', 0.9833), ('cracked', 'V', 0.5413), ('.', ',', 0.9983)]]

#$aapl is holding well in the bull flag. did you notice the golden cross on the daily? ;) URL
#[[('$aapl', '^', 0.8645), ('is', 'V', 0.9961), ('holding', 'V', 0.9728), ('well', 'R', 0.8528), ('in', 'P', 0.9986), ('the', 'D', 0.9991), ('bull', 'N', 0.9745), ('flag', 'N', 0.9849), ('.', ',', 0.9979), ('did', 'V', 0.9994), ('you', 'O', 0.9957), ('notice', 'V', 0.9922), ('the', 'D', 0.999), ('golden', 'A', 0.4243), ('cross', 'N', 0.9899), ('on', 'P', 0.9987), ('the', 'D', 0.9991), ('daily', 'A', 0.5749), ('?', ',', 0.9897), (';)', 'E', 0.9774), ('URL', 'N', 0.4083)]]




"""
Interesting projects:
Example #36
    def build_graph(self, tweet):

        # input of this function is a text in the str format, "this is an sample input"
        exp_context_list_dict, meds_disease_dict, phrase_synonym_dict = self.get_ctxt_exp_list(
        )
        cluster_dict = self.get_cluster_data()
        # Getting Np/NE
        cleaned_tweet = self.clean_tweets(tweet)
        entity_results = CMUTweetTagger.runtagger_parse(
            [cleaned_tweet])  # the input should be a list of texts
        #     print("3- entity_results:" , entity_results)
        print("--- End Tagging Tweets ---")
        print("Tagged Ents: ", len(entity_results))

        # For each tweet
        for i in tqdm(range(len(entity_results))):
            phrases_list = set(self.new_ne_extraction(entity_results[i]))
            type_list = []
            b_syn_list = []
            topic_list = []
            disease_list = []
            topic_links = []
            if len(phrases_list) > 0:
                for ent in phrases_list:
                    ent = (ent.replace("#", "")).lower().strip()
                    ent = ''.join(e for e in ent if e.isalnum())
                    # if len(ent)>0:
                    #     if ent[0].isalpha() == False:
                    #         ent = ent[1:]
                    # Types
                    type_word = None
                    if ent in exp_context_list_dict.keys():
                        type_word = exp_context_list_dict.get(ent)
                        type_list.append(type_word)
                    else:
                        type_list.append("")
                    # B-syn list
                    if ent in phrase_synonym_dict.keys():
                        synonym = phrase_synonym_dict.get(ent)
                        b_syn_list.append(synonym)
                    else:
                        b_syn_list.append("")
                    # if "blue" in ent.lower().strip():
                    #         print("Hit   ", ent.lower().strip())
                    if ent.lower().strip() in cluster_dict.keys():
                        topic_list.append(cluster_dict.get(
                            ent.lower().strip()))

                    else:
                        topic_list.append("")

                    if type_word == "medication":
                        if ent in meds_disease_dict.keys():
                            disease_list.append(meds_disease_dict.get(ent))
                        else:
                            disease_list.append("")
                    else:
                        disease_list.append("")

            topics = list(set(topic_list))

            if "" in topics:
                topics.remove("")

            for (phrase, typ, b_syn, topic, m) in zip(phrases_list, type_list,
                                                      b_syn_list, topic_list,
                                                      disease_list):
                if typ != "" or topic != "":
                    topic_links.append((topic, phrase, b_syn, m, typ))

            if topic_links == []:
                # print("Hit")
                continue
#                 print(phrase,",",typ,",",b_syn,",",topic,",",m)

            tweet_dict = {}
            tweet_dict["topics"] = topics
            tweet_dict["topic_links"] = topic_links
            tweet_dict["tweet"] = tweet

#             print(tweet_dict)
#             print("--------------------------------------------")

        return tweet_dict
Example #37
import nltk
import CMUTweetTagger

ex1 = "Before what happened at lunch when someone decided to piss me off.I had so much at the diabetes walk. Especially with my girls #TWERKTEAM!"
#text1 = nltk.word_tokenize(ex1)
#print nltk.pos_tag(text1)

ex2 = "my mum got me a tofee apple company.. Diabetes is definately tryna proceed to me LOL"
#text2 = nltk.word_tokenize(ex2)
#print nltk.pos_tag(text2)

print CMUTweetTagger.runtagger_parse(["my mum got me a tofee apple company.. Diabetes is definately tryna proceed to me LOL"])
Example #38
def corpus_maker(tweets, positive_classification_type, corpus_filename, negative_classification_type=""):

	y = []
	corpus = []


	corpus_dict = {}
	count = 0
	miss_count=0
	neg_count =0

	labeled_tweet_ids = set()

	vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', charset_error='ignore')

	tweets_list=[]


	for row in tweets:
		# try:
		# 	row = i.decode('utf-8', 'ignore').split('?!~')
		# except UnicodeEncodeError as e:
		# 	miss_count += 1
		# 	continue

		if(len(row) == 5):
			tweet_id = row[1]
			tweet_text = row[2].strip().replace("\n","")
			classification = row[4].strip()
			if classification == positive_classification_type:
				label = 1
				tweets_list.append(tweet_text)
				y.append(label)

			# this is where the problem is 
			elif classification != positive_classification_type:
				# print classification
				label = -1
				tweets_list.append(tweet_text)
				y.append(label)
				neg_count += 1

			labeled_tweet_ids.add(tweet_id)

	tagged_tweets = CMUTweetTagger.runtagger_parse(tweets_list, run_tagger_cmd=tweet_stack_command)

	print "len tagged tweets " + str(len(tweets_list))
	print "len tagged tweets " + str(len(tagged_tweets))

	for tagged_tweet in tagged_tweets:
		temp_list = [] # filtered words of a tweet
		for word in tagged_tweet:
			if word[1] != "U":
				temp_list.append(word[0].lower())

		filtered_tweet = ' '.join(temp_list)
		corpus.append(filtered_tweet)
		corpus_dict[filtered_tweet] = tweet_id
		
		count += 1

	vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', charset_error='ignore')



	counts = vectorizer.fit_transform(corpus) # rename counts to tfiidf_counts

	print vectorizer.get_feature_names()

	#transformer = TfidfTransformer()
	x = counts #transformer.fit_transform(counts)
	serialize_object( (y,x,vectorizer,labeled_tweet_ids), corpus_filename)

	return (y,x,vectorizer,labeled_tweet_ids)
Example #39
tofile = open(argv[3],"w") #file to take output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()

sociallists = [] # to take hashtags in a list

for line in idiomsEx:
	sociallists.append(line.replace("\n",""))

parsedSociallists = [] #parse the hashtags using str2num library and add them as a list

for line in sociallists:
	parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))

postags = cmu.runtagger_parse(parsedSociallists) #gets a list of postags each for each hashtag

i = 0

for ParsedTag,postag,type in zip(parsedSociallists,postags,list_type):
	checkTweetsret = checkTweets.checkTweets(ParsedTag.replace(" ",""),"test/"+str(i/100)+"tweets.txt")
	#checks for the hashtag in the files provided.

	i+=1

	tofile = open(argv[3],"a")
	tofile.write(str(testFile1.test1(ParsedTag))+","+ #number of characters in hashtag
	str(testFile2.test2(ParsedTag))+","+ #number of words in hashtag
	str(testFile4.test4(ParsedTag))+","+ #presence of days
	str(testFile5.numbercount(postag))+","+ # presence of numbers
	str(testFile5.prepositioncount(postag))+","+ #presence of prepositions
Example #40
@Licence :
	This work is licensed under the
	Creative Commons Attribution-NonCommercial-ShareAlike 4.0
	International License. To view a copy of this license,
	visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
'''

import CMUTweetTagger as cmu
import wordsegment as ws

file1 = open()  # input file paths were elided in the original snippet
file2 = open()

data1 = file1.read()
data2 = file2.read()

tweets1 = data1.split("\n\n")

hashtags = []

for tweet1 in tweets1:
    hashtag = tweet1.split("\n")[0]
    hashtags.append(" ".join(ws.segment(hashtag)))

postags = cmu.runtagger_parse(hashtags)

i = 0

for postag in postags:
    if '$' in "".join(postag):
        i += 1
Example #41
import operator

import my_feature_ex as fx
import my_word_cloud as wcloud

# <codecell>

import nltk
import json
sys.path.append('/Users/doug/SW_Dev/ark-tweet-nlp-0.3.2')
import CMUTweetTagger
#print CMUTweetTagger.runtagger_parse(['example tweet 1', '@foo example tweet 2'])

RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar"
RUN_TAGGER_CMD_PTB = "java -XX:ParallelGCThreads=2 -Xmx500m -jar /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar --model /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/model.ritter_ptb_alldata_fixed.20130723.txt"
print CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'], run_tagger_cmd=RUN_TAGGER_CMD)
print CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'], run_tagger_cmd=RUN_TAGGER_CMD_PTB)

def annotate_pos(tweets, ptb=False):
    if ptb:
        tagger_cmd = RUN_TAGGER_CMD_PTB
    else:
        tagger_cmd = RUN_TAGGER_CMD
    ids = []
    texts = []
    for key, value in tweets.items():
        ids.append(key)
        texts.append(json.dumps({'text':value['text']}))
    pos = CMUTweetTagger.runtagger_parse(texts, run_tagger_cmd=tagger_cmd)
    
    if len(ids) != len(pos):
        raise Exception("Error: Tweet Tagger returned incorrect results")

            print "Xclean.shape:", Xclean.shape
            #print map_index_after_cleaning
            #play with scaling of X
            X = Xclean
            Xdense = np.matrix(X).astype('float')
            X_scaled = preprocessing.scale(Xdense)
            X_normalized = preprocessing.normalize(X_scaled, norm='l2')
            #transpose X to get features on the rows
            #Xt = X_scaled.T
            # 				#print "Xt.shape:", Xt.shape
            vocX = vectorizer.get_feature_names()
            #print "Vocabulary (tweets):", vocX
            #sys.exit()

            boost_entity = {}
            pos_tokens = CMUTweetTagger.runtagger_parse(
                [term.upper() for term in vocX])
            #print "detect entities", pos_tokens
            for l in pos_tokens:
                term = ''
                for gr in range(0, len(l)):
                    term += l[gr][0].lower() + " "
                if "^" in str(l):
                    boost_entity[term.strip()] = 2.5
                else:
                    boost_entity[term.strip()] = 1.0
# 				print "boost_entity",  sorted( ((v,k) for k,v in boost_entity.iteritems()), reverse=True)

#  				boost_term_in_article = {}
#  				for term in vocX:
#   					if term in vocA:
#  						#print "boost term in article:", term, vocA