示例#1
0
def main():
    """Tag one batch of tweets loaded from SQL and print the outcome.

    Parses the command-line arguments, publishes the shared helpers as
    module-level globals (``sql``, ``bing``, ``new_software`` and
    ``possible_tags``), then loads page 0 from ``sql``.  For each fetched
    tweet the text is lower-cased, URLs are stripped out, versions and
    prices are extracted, and 5-grams of the tokens are passed to
    ``tag_tweets``.  The tagged result is only printed; the MongoDB insert
    and the SQL "tagged" update are currently commented out.

    The syntax is kept valid on both Python 2.6+ and Python 3: every
    ``print`` call takes a single argument and the handler uses
    ``except ... as``.
    """
    global sql, bing, new_software, possible_tags

    args = argument_parser.main()
    sql = SQLConnector(host=args.host, port=args.port, user=args.user,
                       passwd=args.password, db=args.db)
    bing = BingSearch()
    new_software = NewSoftware()
    possible_tags = []
    # NOTE(review): only the disabled mongo.insert() below would use this
    # connection; it is still created in case construction has side effects.
    # It reads args.H while the SQL connector reads args.host - confirm.
    mongo = MongoConnector(host=args.H, db=args.db)

    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print("No tweets left to analyse")
            break

        # NOTE(review): deliberately limited to a single fetch; the original
        # trailing comment suggests the bound was once range(rows).
        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()

                # Remove every URL from the text before further analysis.
                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()

                versions = find_version(text)

                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                prices = find_price(words)

                # NOTE(review): the result is never read; the call is kept
                # because pos() is defined outside this block.
                pos_ = pos(words)
                ngram = ngrams(words, 5)

                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)
                    if tweet_id in possible_tags:
                        print(tweet_id)
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print(tweet)
                            print(tagged_tweet)
                            print("")
                            # mongo.insert(tagged_tweet)
                        else:
                            # Same output as the old `print tweet, "No software"`.
                            print("%s No software" % (tweet,))
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError as e:
                    # This will allow the tweet to be tagged again at a later stage
                    print("%s: %s" % (tweet_id, e))
                    print(tweet)
                    print("")
示例#2
0
 def _create_ngram(self, tokenized, gram_length):
     """Build n-grams over the POS-tagged form of ``tokenized``.

     Starts at ``gram_length`` and, while the result is falsy, retries
     with the next smaller size (for tweets shorter than ``gram_length``).
     The fallback stops at size 1, so an input that yields no n-grams at
     all returns the last falsy result instead of asking ``ngrams`` for
     zero or negative sizes.

     ``tokenized`` is passed to ``pos``; ``gram_length`` is the preferred
     n-gram size.  Returns whatever ``ngrams`` returned for the first size
     that produced a truthy result, or the last falsy result if none did.
     """
     pos_ = pos(tokenized)
     gram = ngrams(pos_, gram_length)
     # In case tweet length less than gram_length: shrink, but never below 1.
     while not gram and gram_length > 1:
         gram_length -= 1
         gram = ngrams(pos_, gram_length)
     return gram