def _process_tweet(tweet):
    """Extract features from one tweet row, tag it, and print the outcome.

    ``tweet`` is indexed positionally: ``tweet[0]`` is used as the tweet id,
    ``tweet[1]`` as the text and ``tweet[2]`` is stored under "sentiment".
    Reads the module-level ``possible_tags`` global.
    """
    tweet_id = str(tweet[0])
    text = tweet[1].lower()

    # URLs are removed from the text before any further extraction.
    urls = find_url(text)
    for url in urls:
        text = text.replace(url, "").strip()

    versions = find_version(text)
    words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
    prices = find_price(words)
    ngram = ngrams(words, 5)

    try:
        tagged_tweet = tag_tweets(ngram, tweet_id)
        tagged_tweet.add("tweet_text", text)
        tagged_tweet.add("sentiment", tweet[2])
        tagged_tweet.add("url", urls)
        tagged_tweet.add("version", versions)
        tagged_tweet.add("price", prices)

        if tweet_id in possible_tags:
            print(tweet_id)
        elif (tagged_tweet.contains("software_id")
              or tagged_tweet.contains("operating_system_id")):
            print(tweet)
            print(tagged_tweet)
            # print("") rather than print(): under Python 2's print statement
            # the latter would output "()" instead of a blank line.
            print("")
            # NOTE: persistence is currently disabled.
            # mongo.insert(tagged_tweet)
        else:
            print("%s %s" % (tweet, "No software"))
        # NOTE: marking the tweet as tagged is currently disabled.
        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
    except IncompleteTaggingError as e:
        # This will allow the tweet to be tagged again at a later stage
        print("%s: %s" % (tweet_id, e))
        print(tweet)
        print("")


def main():
    """Load tweets from SQL, tag each one, and print the results.

    Parses the command-line arguments, initialises the module-level
    ``sql``, ``bing``, ``new_software`` and ``possible_tags`` globals, and
    then processes the rows returned by ``sql.load_data``. Results are only
    printed; the MongoDB insert and the SQL "tagged" update are commented
    out in the per-tweet step.
    """
    # Published as module-level names (presumably read by helpers such as
    # tag_tweets -- they are not visible from here; confirm before changing).
    global sql, bing, new_software, possible_tags

    args = argument_parser.main()
    sql = SQLConnector(host=args.host, port=args.port, user=args.user,
                       passwd=args.password, db=args.db)
    bing = BingSearch()
    new_software = NewSoftware()
    possible_tags = []
    # Only needed by the disabled insert; still constructed so that whatever
    # the connector does on creation keeps happening.
    mongo = MongoConnector(host=args.H, db=args.db)

    # Only the first page is processed.
    for page in range(1):
        res = sql.load_data(page)
        if not res.num_rows():
            print("No tweets left to analyse")
            break

        # Debug limit: a single fetch_row() call per page. The original
        # comment suggests this was meant to be range(<number of rows>).
        for _ in range(1):
            for tweet in res.fetch_row():
                _process_tweet(tweet)
def _create_ngram(self, tokenized, gram_length):
    """Build n-grams over the output of ``pos(tokenized)``.

    ``ngrams`` is first called with the requested ``gram_length``. While it
    returns a falsy result (e.g. the tweet has fewer tokens than
    ``gram_length``), the length is reduced by one and the call is retried.

    The retry stops at length 1, so an input that never yields any n-gram
    (such as an empty token list) returns the last falsy result instead of
    calling ``ngrams`` with zero or negative lengths.

    :param tokenized: tokens passed to ``pos``.
    :param gram_length: preferred n-gram length.
    :return: the first truthy result of ``ngrams``, or the last falsy one.
    """
    tagged = pos(tokenized)

    # Always try the requested length once, whatever its value.
    gram = ngrams(tagged, gram_length)
    # In case tweet length less than gram_length: back off to shorter grams.
    while not gram and gram_length > 1:
        gram_length -= 1
        gram = ngrams(tagged, gram_length)
    return gram