class ImgCreator(object): def __init__(self, mongo=None, **kwargs): if not mongo: self._mongo = MongoConnector(host=kwargs["H"], port=kwargs["mongoport"], db=kwargs["db"]) else: self._mongo = mongo def get_cursor(self, word): return self._mongo.cursor(word) def web_query(self, word): word = word.lower() return self.analyse(self.get_cursor(word), word) def query(self, *args): if len(args) == 0: self._show_all() else: for word in args: self.analyse(self.get_cursor(word), word) def _show_all(self): tools = self._mongo.find_all() print tools for word in tools: self.analyse(self.get_cursor(word), word) def close(self): self._mongo.close() def analyse(self, cursor, tool): sentiments = cursor.distinct("sentiment") data = [] colours = [] for sentiment in sentiments: count = 0 for tag in cursor: if "sentiment" in tag: if sentiment == tag["sentiment"]: count += 1 data.append([str(sentiment), count]) cursor.rewind() if sentiment == "neutral": colours.append("blue") elif sentiment == "positive": colours.append("green") elif sentiment == "negative": colours.append("red") cursor.close() return data, colours
def __init__(self, sql=None, mongo=None, **kwargs): super(TweetTagger, self).__init__() if not sql: self._sql = SQLConnector(host=kwargs['host'], port=kwargs['port'], user=kwargs['user'], passwd=kwargs['password'], db=kwargs['db']) else: self._sql = sql if not mongo: self._mongo = MongoConnector(host=kwargs['H'], port=kwargs['mongoport'], db=kwargs['db']) else: self._mongo = mongo self._keyword = None self._bing = BingSearch() self._binged = Dictionary()
class TweetTagger(object): def __init__(self, sql=None, mongo=None, **kwargs): super(TweetTagger, self).__init__() if not sql: self._sql = SQLConnector(host=kwargs['host'], port=kwargs['port'], user=kwargs['user'], passwd=kwargs['password'], db=kwargs['db']) else: self._sql = sql if not mongo: self._mongo = MongoConnector(host=kwargs['H'], port=kwargs['mongoport'], db=kwargs['db']) else: self._mongo = mongo self._keyword = None self._bing = BingSearch() self._binged = Dictionary() def _tag(self, tweet): tweet_id = str(tweet[0]) original = tweet[1].decode('utf-8', 'ignore') text = original.lower().replace('#','').strip() #text = "download 60 hundred pounds 72 million $800 billion pounds holiday havoc v2 in itunes for free 99" urls = find_url(original) for url in urls: text = text.replace(url.lower(), "").strip() word_freqs = word_frequencies(text) #print word_freqs versions = find_version(text) words = regexp_tokenize(text, pattern=r'\w+([.,]\w+)*|\S+') prices = find_price(words) five_gram = self._create_ngram(tokenized=words, gram_length=5) tagged_tweet = self._ngram_tagger(five_gram, tweet_id) tagged_tweet.add('sentiment', tweet[2]) tagged_tweet.add('tweet', original) tagged_tweet.add('url', urls) tagged_tweet.add('version', versions) tagged_tweet.add('price', prices) if tagged_tweet.contains('software_name'): query = {'software_name':tagged_tweet.get('software_name')} words = {} for w in word_freqs: words['words.'+w] = word_freqs[w] #print query, words self._mongo.update_freqs(query,words) return tagged_tweet def _create_ngram(self, tokenized, gram_length): pos_ = pos(tokenized) #print pos_ gram = None while not gram: # In case tweet length less than gram_length gram = ngrams(pos_, gram_length) gram_length -= 1 return gram def _ngram_tagger(self, ngram, tweet_id): tags = Dictionary() tags.add('tweet_db_id', tweet_id) if self._keyword: keyword = self._keyword if self._sql.isSoftware(keyword): entry = self._sql.getSoftware() tags.add('software_name', keyword) tags.add('software_id', str(entry[0])) elif self._sql.isCompany(keyword): entry = self._sql.getCompany() tags.add('company_name', keyword) tags.add('company_id', str(entry[0])) elif self._sql.isOS(keyword): entry = self._sql.getOS() tags.add('os_name', keyword) tags.add('os_id', str(entry[0])) for tagged_words in ngram: self._tagger(tagged_words, tags) print 'lol' print '2',tags return tags def _tagger(self, gram, tags): words = [] tags_ = [] phrase = "" pos_soft = "" possible_software = False # Compile regular expressions outside of for loop # for efficiency purposes free_price = re.compile(r'^free$', re.IGNORECASE) check_is = re.compile(r'^is$|^for$', re.IGNORECASE) check_get = re.compile(r'^download$|^get$', re.IGNORECASE) check_on = re.compile(r'^on$|^for$', re.IGNORECASE) for tagged_word in gram: word = tagged_word[0] tag = tagged_word[1] phrase += word + " " #print word, tag try: if tagIsNoun(tag): if self._sql.isSoftware(word): entry = self._sql.getSoftware() try: prev_tag = tags_.pop() tags_.append(prev_tag) if not tagIsDeterminantOrPreposition(prev_tag): tags.add('software_name',word) tags.add('software_id', str(entry[0])) except: possible_software = True elif self._sql.isCompany(word): entry = self._sql.getCompany() try: prev_tag = tags_.pop() tags_.append(prev_tag) if not tagIsDeterminantOrPreposition(prev_tag): raise # Add to tags except: tags.add('company_name',word) tags.add('company_id', str(entry[0])) elif self._sql.isProgLang(word): entry = self._sql.getProgLang() try: prev_tag = tags_.pop() tags_.append(prev_tag) if not tagIsDeterminantOrPreposition(prev_tag): raise # Add to tags except: tags.add('programming_language_name', word) tags.add('programming_language_id', str(entry[0])) if self._sql.isOS(word): entry = self._sql.getOS() try: prev_tag = tags_.pop() tags_.append(prev_tag) prev = words.pop() words.append(prev) if not tagIsDeterminantOrPreposition(prev_tag) or re.match(check_on, prev): tags.add('os_name', word) tags.add('os_id', str(entry[0])) except: possible_software = True except ProgrammingError: pass if possible_software: if tagIsNoun(tag): pos_soft += word + " " if word == gram[len(gram)-1][0]: # If 'word' is last word in n-gram pos_soft = "" else: prev = words.pop() words.append(prev) if not re.match(check_get, prev): if check_version(word): tags.add('version', word) possible_software = False if re.match(free_price, word): try: prev = words.pop() words.append(prev) if re.match(check_is, prev): tags.add('price', word) else: prev = tags_.pop() tags_.append(prev) if tagIsNoun(prev): tags.add('price', word) except: # This is first word in phrase pass elif re.match(check_get, word): possible_software = True # Back in main part of loop words.append(word) tags_.append(tag) # End of for loop phrase = phrase.strip() if len(pos_soft) > 0: pos_soft = pos_soft.strip() if not tags.get('software_name'): if self._binged.contains(pos_soft): if self._binged.get(pos_soft): tags.add('software_name', pos_soft) else: try: bool_ = check_bing(pos_soft, self._bing) self._binged[pos_soft]= bool_ if bool_: # Insert into dictionary db? tags.add('software_name', pos_soft) except ServerError, e: print e raise IncompleteTaggingError() print '1',tags
def __init__(self, mongo=None, **kwargs): if not mongo: self._mongo = MongoConnector(host=kwargs["H"], port=kwargs["mongoport"], db=kwargs["db"]) else: self._mongo = mongo