Example #1
 def start(self, **kwargs):
     only_campaign = kwargs.get('campaign', None)
     regenerate_all = kwargs.get('regenerate', False)
     while True:
         end = self.getCurrentSummarizationEnd()
         for account in MongoManager.getActiveAccounts(max_age=timedelta(hours=1)):
             for campaign in account.getActiveCampaigns():
                 MongoManager.ensureIndex('summarized_tweets_%s' % campaign.getId(), [("start", 1)])
                 if only_campaign and only_campaign.getId() != campaign.getId(): continue
                 if regenerate_all:
                     self.clearSummarization(campaign)
                     collection_name = 'tweets_%s' % campaign.getId()
                     res = MongoManager.findTweets(collection_name, sort=("x_created_at", 1), limit=1)
                     if res.count():
                         lsd = res[0]['x_created_at'].replace(minute=0, second=0, microsecond=0)
                     else:
                         lsd = datetime.now().replace(minute=0, second=0, microsecond=0)
                 else:
                     lsd = self.getLastSummarizedDate(campaign)
                 if lsd < end:
                     while lsd < end:
                         self.summarize(campaign, lsd, min(end, lsd + timedelta(days=1)), timedelta(hours=1), None)
                         lsd = lsd + timedelta(days=1)
         pprint("sleeping 20 seconds")
         regenerate_all = False
         time.sleep(20)
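
The helper getCurrentSummarizationEnd is not shown in this listing. A minimal sketch of what it might return, assuming summaries are aligned to whole hours (hypothetical, not the original implementation):

from datetime import datetime

def getCurrentSummarizationEnd():
    # Assumption: only summarize up to the top of the current hour, so the
    # partially elapsed hour is picked up again on the next 20-second pass.
    return datetime.now().replace(minute=0, second=0, microsecond=0)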
Example #2
 def processItem(self, entry):
     campaigns = entry['campaigns']
     del entry['campaigns']
     for campaign in campaigns:
         collection_name = "fb_posts_%s" % campaign.getId()
         #pprint("saving entry to campaign %s" % campaign.getName())
         MongoManager.saveDocument(collection_name, entry)
Example #3
    def processItem(self, tweet):
        #accs = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))  # IS THIS NECESSARY?? COMMENTED OUT FOR NOW
        #pprint (tweet)
        #pprint (tweet.getExtractedInfo())
        follow_accounts = MongoManager.getFollowAccountsbyCampaign(
            max_age=timedelta(seconds=10))
        bcs = ClassifierManager.getBrandClassifiers()  # this should also be cached in ClassifierManager
        tcs = None
        pms = self.getBrandClassifiersByCampaign(tweet, bcs, follow_accounts)
        ## TODO: also add the tweets that don't match but come from a user the brand follows
        #pprint(pms)
        for cid, pmlist in pms.items():
            if tcs is None: tcs = ClassifierManager.getTopicClassifiers()
            tms = self.getTopicClassifiers(tweet, cid, tcs)
            tweet.setExtractedTopics(tms)
            tweet.setExtractedInfo(pmlist)
            tweet.setGender(
                GenderClassifier.extractGender(tweet.getDisplayName()))
            tweet.resetFollowAccountsMentionCount()
            user_mentions = tweet.getUserMentions()
            for fa in follow_accounts:
                if fa in user_mentions:
                    for fainfo in follow_accounts[fa]:
                        if fainfo['cid'] == cid:
                            tweet.setFollowAccountsMentionCount(fa, 1)
            #pprint(pmlist)
            #pprint("saving tweet to campaign %s" % cid)
            MongoManager.saveDocument("tweets_%s" % cid, tweet.getDictionary())

        return None  # return nothing so tweets don't accumulate in the last list and exhaust memory
Example #4
 def summarize(self, campaign, start, end, interval, tweetlist=None):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     timerange = self.calculateSummarizedIntervals(campaign, start, end, interval, tweetlist)
     for interv in timerange:
         res = MongoManager.findOne(collection_name, filters={'start': interv['start'], 'end': interv['end']})
         if res: interv['_id'] = res['_id']
         MongoManager.saveDocument(collection_name, interv)
Example #5
 def run(self):
     year, month = self.findFirstMonth()
     #year = 2015
     #month = 2
     if not year or not month: return
     d = datetime(year, month, 1)
     while not self.finish_flag and d <= datetime.now():
        
         feed = self.getFeed(self.url + "/%s/%s/%s/feed" % (d.year, d.month, d.day))
         for entry in feed.entries:
             if self.finish_flag: break
             if entry.slash_comments > 0:
                 comments_feed = self.getFeed(entry.wfw_commentrss)
                 if comments_feed:
                     for comment_entry in comments_feed.entries:
                         fe = FeedEntry.fromFeedParserEntry(comments_feed.feed.link, comment_entry)
                         fe.account = self.account
                         fe.campaign = self.campaign
                         self.queue.put(fe)
                         
            
         d = d + timedelta(days=1)
         if d.day == 1:  # month rolled over; check whether there are posts in the new month
             while not self.finish_flag and d <= datetime.now():
                 dummy_feed = self.getFeed(self.url + "/%s/%s/feed" % (d.year, d.month))
                 if dummy_feed.entries:
                     break
                 d = (d + timedelta(days=32)).replace(day=1)  # advance one month
     
     if d > datetime.now():
         acc = MongoManager.getAccount(id=self.account.getId())
         camp = acc.getCampaign(id=self.campaign.getId())
         camp.addHistoryFetchedForum(self.url)
         MongoManager.saveCampaign(acc, camp)
Example #6
def getListOfUsers():
    mng = MongoManager()
    dbResult = mng.get("tests")
    # return the documents from the "tests" collection as JSON
    response.set_header("Content-Type", "application/json")

    result = json.dumps(dbResult)
    return result
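
The bare response object above suggests these handlers run under a web framework such as Bottle. A hedged wiring sketch, assuming Bottle and an illustrative route path (neither is shown in this listing):

from bottle import route, run

# Hypothetical route registration for the handler defined above.
route('/users', method='GET')(getListOfUsers)

if __name__ == '__main__':
    run(host='localhost', port=8080)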
Example #7
 def getLastSummarizedDate(self, campaign):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     res = MongoManager.find(collection_name, sort=("start", -1), limit=1)
     if res.count():
         return res[0]['end']
     else:
         collection_name = 'tweets_%s' % campaign.getId()
         res = MongoManager.findTweets(collection_name, sort=("x_created_at", 1), limit=1)
         if res.count():
             return res[0]['x_created_at'].replace(minute=0, second=0, microsecond=0)
         return datetime.now().replace(minute=0, second=0, microsecond=0)
Example #8
 def summarize(self, campaign, start, end, interval, tweetlist=None):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     timerange = self.calculateSummarizedIntervals(campaign, start, end,
                                                   interval, tweetlist)
     for interv in timerange:
         res = MongoManager.findOne(collection_name,
                                    filters={
                                        'start': interv['start'],
                                        'end': interv['end']
                                    })
         if res: interv['_id'] = res['_id']
         MongoManager.saveDocument(collection_name, interv)
Example #9
 def processItem(self, item):
     polls_ht = MongoManager.getPollsByHashtag(max_age=timedelta(
         seconds=10))
     tweet = Tweet.createFromUnknownSource(item)
     #pprint(tweet)
     for ht in tweet.getHashtags():
         if ht in polls_ht:
             for poll in polls_ht[ht]:
                 MongoManager.saveDocument("polls_" + poll.getId(),
                                           tweet.getDictionary())
                 #pprint("grabando tweet para poll %s" % poll.getName())
     return tweet
Example #10
 def getLastSummarizedDate(self, campaign):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     res = MongoManager.find(collection_name, sort=("start", -1), limit=1)
     if res.count():
         return res[0]['end']
     else:
         collection_name = 'tweets_%s' % campaign.getId()
         res = MongoManager.findTweets(collection_name,
                                       sort=("x_created_at", 1),
                                       limit=1)
         if res.count():
             return res[0]['x_created_at'].replace(minute=0,
                                                   second=0,
                                                   microsecond=0)
         return datetime.now().replace(minute=0, second=0, microsecond=0)
Example #11
 def getSummarizedData(self, campaign, start, end):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     #print 41, datetime.now()
     res = MongoManager.find(collection_name,
                             filters={
                                 'start': {
                                     "$gte": start,
                                     "$lte": end
                                 },
                                 'end': {
                                     "$lte": end
                                 }
                             },
                             sort=('start', 1))
     #pprint(res.explain())
     #print 43, datetime.now()
     #timerange = [SumDict(r) for r in res]
     timerange = list(res)
     #print 44, datetime.now()
     #for r in timerange:
     #    print r['start'], r['end'], r['stats']['total_tweets'], r['sentiment'], r.get('calculated', '')
     #print 44, datetime.now()
     #print timerange[-1]['end'], end
     if timerange and timerange[-1]['end'] < end:
         d = self.calculateSummarizedIntervals(campaign,
                                               timerange[-1]['end'], end,
                                               end - timerange[-1]['end'])
         #for k in d:
         #    k['calculated'] = True
         timerange.extend(d)
     #for r in timerange:
     #    print r['start'], r['end'], r['stats']['total_tweets'], r['sentiment'], r.get('calculated', '')
     #print 45, datetime.now()
     return timerange
Example #12
 def generateBrandClassifier(self, br):
     bc = BrandClassifier()
     bc.account_id = br.account_id
     bc.account_name = br.account_name
     bc.campaign_id = br.campaign_id
     bc.campaign_name = br.campaign_name
     bc.score_threshold = br.score_threshold
     bc.name = {br.name: br.synonyms}
     bc.brand_confidence_clues = self.genClassifierClues(br.keywords)
     for kws in br.keyword_sets:
         if kws.getId():
             bc.brand_confidence_clues.append((kws.getValue(),) + tuple(MongoManager.getKeywordset(id=kws.getId()).getKeywords()))
     if br.rules:
         bc.brand_regexps = [(re.compile(self.getBrandRegexpFromRule(br, rule), re.I|re.U), rule) for rule in br.rules]
     pr_number = 0
     for pr in br.children:
         bc.product_list.append(pr.name)
         bc.products[pr.name] = pr.synonyms
         bc.product_regexps[pr.name] = []
         for rule in pr.rules:
             bc.product_regexps[pr.name].append((re.compile(self.getProductRegexpFromRule(br, pr, pr_number, rule), re.I|re.U), rule))
         if pr.use_brand_id_rules: 
             for rule in br.rules:
                 if rule.find("[P]") >= 0:
                     bc.product_regexps[pr.name].append((re.compile(self.getProductRegexpFromRule(br, pr, pr_number, rule), re.I|re.U), rule))
         pr_number += 1
         bc.product_confidence_clues[pr.name] = self.genClassifierClues(pr.keywords)
     return bc
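
The rule strings compiled here follow the "[M]"/"[P]" placeholder convention that also drives the Gnip rule generator in Example #40. A rough sketch of how such a rule could be expanded into a regular expression, assuming "[M]" stands for any brand synonym and "[P]" for any product synonym (hypothetical helper, not the original getBrandRegexpFromRule):

import re

def placeholder_rule_to_regexp(rule, brand_synonyms, product_synonyms=()):
    # Assumption: "[M]" matches any brand synonym, "[P]" any product synonym.
    pattern = re.escape(rule)
    pattern = pattern.replace(re.escape("[M]"),
                              "(%s)" % "|".join(re.escape(s) for s in brand_synonyms))
    if product_synonyms:
        pattern = pattern.replace(re.escape("[P]"),
                                  "(%s)" % "|".join(re.escape(s) for s in product_synonyms))
    return re.compile(pattern, re.I | re.U)

# e.g. placeholder_rule_to_regexp("[M] [P]", ["acme"], ["rocket"]) matches "acme rocket".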
Example #13
 def getAllFeedURLs(self):
     res = []
     accs = MongoManager.getActiveAccounts()
     for acc in accs:
         for camp in acc.getActiveCampaigns():
             for url in camp.getForums():
                 res.append((acc, camp, url))
     return res
Example #14
 def getAllFeedURLs(self):
     res = []
     accs = MongoManager.getActiveAccounts()
     for acc in accs:
         for camp in acc.getActiveCampaigns():
             for url in camp.getForums():
                 res.append((acc, camp, url))
     return res
Example #15
 def generateTopicClassifier(self, topicdoc):
     tc = TopicClassifier()
     tc.topic_name = topicdoc.getName()
     tc.topic_id = str(topicdoc.getId())
     tc.topic_confidence_clues = self.genClassifierClues(topicdoc.getKeywords())
     for kws in topicdoc.getKeywordsets():
         tc.topic_confidence_clues.append((kws.getValue(),) + tuple(MongoManager.getKeywordset(id=kws.getId()).getKeywords()))
     return tc
Example #16
 def getNamesDatabase(cls, **kwargs):
     max_age = kwargs.get('max_age', timedelta(seconds=0))
     if not max_age or not cls.cached_names_database or (datetime.now() - cls.cached_names_database['fetch_time'] > max_age):
         namesdb = MongoManager.find("gender_names")
         res = {}
         for name in namesdb:
             res[name["name"].lower()] = name["gender"]
         cls.cached_names_database = {'data': res, 'fetch_time': datetime.now()}
     return cls.cached_names_database['data']
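
Callers elsewhere in this listing (the gender classifier in Examples #32 and #36) use this class-level cache through GenderClassifier. A small usage sketch; whether a given name appears in the gender_names collection depends on the database contents:

from datetime import timedelta

# The first call hits Mongo; further calls within max_age reuse the cached dict.
names = GenderClassifier.getNamesDatabase(max_age=timedelta(minutes=5))
print(names.get("maria", "U"))  # "M"/"F" if the name is in gender_names, else the default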
Example #17
 def generateGnipRulesFromMongo(self):
     accounts = MongoManager.getActiveAccounts()
     rules = []
     for acc in accounts:
         for camp in acc.getActiveCampaigns():
             for fp in camp.getFacebookFanpages():
                 #rules.append({"value": fp, "tag": "%s/%s/%s" % (acc.getName(), camp.getName(), fp)})
                 rules.append({"value": fp, "tag": None})
     return rules
Example #18
 def getAllHistoryFeedURLs(self):
     res = []
     accs = MongoManager.getActiveAccounts()
     for acc in accs:
         for camp in acc.getActiveCampaigns():
             hff = camp.getHistoryFetchedForums()
             for url in camp.getForums():
                 if url not in hff:
                     res.append((acc, camp, url))
     return res
Example #19
 def getAllHistoryFeedURLs(self):
     res = []
     accs = MongoManager.getActiveAccounts()
     for acc in accs:
         for camp in acc.getActiveCampaigns():
             hff = camp.getHistoryFetchedForums()
             for url in camp.getForums():
                 if url not in hff:
                     res.append((acc, camp, url))
     return res
Example #20
 def getBrandClassifiers(cls):
     # TODO: should also cache this by max_age
     o = cls()
     accounts = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))
     rules = []
     for acc in accounts:
         rules.extend(o.getAccountRules(acc))
     res = []
     for r in rules:
         res.append(o.generateBrandClassifier(r))
     return res
Example #21
 def getFanpageToCampaignsDict(cls):
     if not cls.fanpage_to_campaigns_max_age or not cls.cached_fanpage_to_campaigns or (datetime.now() - cls.cached_fanpage_to_campaigns['fetch_time'] > cls.fanpage_to_campaigns_max_age):        
         print "refetching fanpages to campagins dict"
         accounts = MongoManager.getActiveAccounts()
         data = {}
         for acc in accounts:
             for camp in acc.getActiveCampaigns():
                 for fp in camp.getFacebookFanpages():
                     if fp not in data: data[fp] = []
                     data[fp].append(camp)
         cls.cached_fanpage_to_campaigns = {'data': data, 'fetch_time': datetime.now()}
     return cls.cached_fanpage_to_campaigns['data']
Example #22
 def start(self, **kwargs):
     only_campaign = kwargs.get('campaign', None)
     regenerate_all = kwargs.get('regenerate', False)
     while True:
         end = self.getCurrentSummarizationEnd()
         for account in MongoManager.getActiveAccounts(max_age=timedelta(
                 hours=1)):
             for campaign in account.getActiveCampaigns():
                 MongoManager.ensureIndex(
                     'summarized_tweets_%s' % campaign.getId(),
                     [("start", 1)])
                 if only_campaign and only_campaign.getId() != campaign.getId():
                     continue
                 if regenerate_all:
                     self.clearSummarization(campaign)
                     collection_name = 'tweets_%s' % campaign.getId()
                     res = MongoManager.findTweets(collection_name,
                                                   sort=("x_created_at", 1),
                                                   limit=1)
                     if res.count():
                         lsd = res[0]['x_created_at'].replace(minute=0,
                                                              second=0,
                                                              microsecond=0)
                     else:
                         lsd = datetime.now().replace(minute=0,
                                                      second=0,
                                                      microsecond=0)
                 else:
                     lsd = self.getLastSummarizedDate(campaign)
                 if lsd < end:
                     while lsd < end:
                         self.summarize(campaign, lsd,
                                        min(end, lsd + timedelta(days=1)),
                                        timedelta(hours=1), None)
                         lsd = lsd + timedelta(days=1)
         pprint("sleeping 20 seconds")
         regenerate_all = False
         time.sleep(20)
Example #23
 def getNamesDatabase(cls, **kwargs):
     max_age = kwargs.get('max_age', timedelta(seconds=0))
     if not max_age or not cls.cached_names_database or (
             datetime.now() - cls.cached_names_database['fetch_time'] >
             max_age):
         namesdb = MongoManager.find("gender_names")
         res = {}
         for name in namesdb:
             res[name["name"].lower()] = name["gender"]
         cls.cached_names_database = {
             'data': res,
             'fetch_time': datetime.now()
         }
     return cls.cached_names_database['data']
Example #24
 def getGlobalTrendStopWords(cls, language, **kwargs):
     max_age = kwargs.get('max_age', timedelta(seconds=0))
     if not max_age or not cls.global_trend_stop_words.get(
             language,
             None) or (datetime.now() -
                       cls.global_trend_stop_words[language]['fetch_time'] >
                       max_age):
         cls.global_trend_stop_words[language] = {
             'data':
             set(MongoManager.getGlobalTrendStopWords(language)['words']),
             'fetch_time':
             datetime.now()
         }
     return cls.global_trend_stop_words[language]['data']
Example #25
 def getTopicClassifiers(cls):
     # TODO: should also cache this by max_age
     # returns a dictionary of topic classifiers per campaign
     o = cls()
     res = {}
     accounts = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))
     for acc in accounts:
         for campaign in acc.getActiveCampaigns():
             topics = campaign.getTopics()
             if not topics: continue
             res[campaign.getId()] = {}
             for topic in topics:
                 #topic['_id'] = topic.getId()  ### IS THIS NEEDED???
                 res[campaign.getId()][topic.getId()] = o.generateTopicClassifier(topic)
     return res
Example #26
    def run(self):
        year, month = self.findFirstMonth()
        #year = 2015
        #month = 2
        if not year or not month: return
        d = datetime(year, month, 1)
        while not self.finish_flag and d <= datetime.now():

            feed = self.getFeed(self.url + "/%s/%s/%s/feed" %
                                (d.year, d.month, d.day))
            for entry in feed.entries:
                if self.finish_flag: break
                if entry.slash_comments > 0:
                    comments_feed = self.getFeed(entry.wfw_commentrss)
                    if comments_feed:
                        for comment_entry in comments_feed.entries:
                            fe = FeedEntry.fromFeedParserEntry(
                                comments_feed.feed.link, comment_entry)
                            fe.account = self.account
                            fe.campaign = self.campaign
                            self.queue.put(fe)

            d = d + timedelta(days=1)
            if d.day == 1:  # month rolled over; check whether there are posts in the new month
                while not self.finish_flag and d <= datetime.now():
                    dummy_feed = self.getFeed(self.url + "/%s/%s/feed" %
                                              (d.year, d.month))
                    if dummy_feed.entries:
                        break
                    d = (d + timedelta(days=32)).replace(day=1)  # advance one month

        if d > datetime.now():
            acc = MongoManager.getAccount(id=self.account.getId())
            camp = acc.getCampaign(id=self.campaign.getId())
            camp.addHistoryFetchedForum(self.url)
            MongoManager.saveCampaign(acc, camp)
Example #27
 def getSummarizedData(self, campaign, start, end):
     collection_name = 'summarized_tweets_%s' % campaign.getId()
     #print 41, datetime.now()
     res = MongoManager.find(collection_name, filters={'start': {"$gte": start, "$lte": end}, 'end': {"$lte": end}}, sort=('start',1))
     #pprint(res.explain())
     #print 43, datetime.now()
     #timerange = [SumDict(r) for r in res]
     timerange = list(res)
     #print 44, datetime.now()
     #for r in timerange:
     #    print r['start'], r['end'], r['stats']['total_tweets'], r['sentiment'], r.get('calculated', '')
     #print 44, datetime.now()
     #print timerange[-1]['end'], end
     if timerange and timerange[-1]['end'] < end:
         d = self.calculateSummarizedIntervals(campaign, timerange[-1]['end'], end, end - timerange[-1]['end'])
         #for k in d:
         #    k['calculated'] = True
         timerange.extend(d)
     #for r in timerange:
     #    print r['start'], r['end'], r['stats']['total_tweets'], r['sentiment'], r.get('calculated', '')
     #print 45, datetime.now()
     return timerange
Example #28
    def processItem(self, feed):
        #pprint (feed)
        #pprint (tweet.getExtractedInfo())
        bcs = ClassifierManager.getCampaignBrandClassifiers(
            feed.account, feed.campaign)  # this should also be cached in ClassifierManager
        tcs = None
        pms = self.getBrandClassifiersByCampaign(feed.getText(), bcs)
        ## TODO: also add the entries that don't match but come from a user the brand follows
        #print "processing feed:", feed
        for cid, pmlist in pms.items():
            if tcs is None:
                tcs = ClassifierManager.getCampaignTopicClassifiers(
                    feed.campaign)
            tms = self.getTopicClassifiers(feed.getText(), cid, tcs)
            feed.setExtractedTopics(tms)
            feed.setExtractedInfo(pmlist)
        if not self.APPLY_BRAND_FILTERS or feed.getExtractedInfo():
            mongores = MongoManager.saveDocument(
                "feeds_%s" % feed.campaign.getId(), feed.getDictionary())
            #print "mongo result: ", mongores

        return None  # return nothing so feeds don't accumulate in the last list and exhaust memory
Example #29
from mongo import MongoManager
mongo_mgr = MongoManager("mongodb://*****:*****@192.168.1.14:27017/stock")

l = mongo_mgr.get_collection_names('stock')
for collection_name in l:
    if "DailyInfo_" in collection_name:
        print(collection_name)
        #mongo_mgr.drop_collection('stock', collection_name)
Example #30
 def clearAllSummarizedData(self, campaign_id):  # this is duplicated!
     collection_name = 'summarized_tweets_%s' % campaign_id
     #print 41, datetime.now()
     res = MongoManager.remove(collection_name, filters={})
Example #31
    def calculateSummarizedIntervals(self, campaign, start, end, interval, tweetlist=None):
        pprint("summarizing tweets for campaign %s between %s and %s" % (campaign.getName(), start, end))
        synonyms = self.getTrendWordsSynonyms(campaign)
        trend_stop_words_set = self.getTrendStopWords(campaign)
        collection_name = 'summarized_tweets_%s' % campaign.getId()
        if tweetlist is None:
            tweetlist = MongoManager.findTweets("tweets_%s" % campaign.getId(), filters={"retweeted_status": {"$exists": False}, "x_created_at": {"$gte": start, "$lte": end}})
        own_fa = campaign.getOwnFollowAccounts()
        timerange = []
        d = start
        while d < end:
            data = SumDict({'start': d, 'end': d+interval})
            data['stats'] = SumDict()
            data['stats']['total_tweets'] = 0
            data['stats']['own_tweets'] = SumDict({'total': 0, 'accounts': SumDict([(a,0) for a in own_fa])})
            data['stats']['own_tweets']['retweets']  = SumDict({'total': 0, 'accounts': SumDict([(a,0) for a in own_fa])})
            data['stats']['own_tweets']['favorites']  = SumDict({'total': 0, 'accounts': SumDict([(a,0) for a in own_fa])})
            data['stats']['mentions']  = SumDict({'total': 0, 'accounts': SumDict([(a,0) for a in own_fa])})
            data['sentiment'] = SumDict()
            data['brand'] = SumDict()
            data['product'] = SumDict()
            data['topic'] = SumDict()
            data['gender'] = SumDict()
            data['words'] = SumDict()
            timerange.append(data)
            d = d + interval
            
        for t in tweetlist:
            for interv in timerange:
                if t.getCreatedDate() >= interv['start'] and t.getCreatedDate() < interv['end']:
                    interv['stats']['total_tweets'] += 1
                    if t.getUsername() in own_fa:
                        interv['stats']['own_tweets']['total'] += 1
                        interv['stats']['own_tweets']['accounts'][t.getUsername()] += 1                    
                        interv['stats']['own_tweets']['retweets']['total'] += t.getRetweetsCount()
                        interv['stats']['own_tweets']['retweets']['accounts'][t.getUsername()] += t.getRetweetsCount()
                        interv['stats']['own_tweets']['favorites']['total'] += t.getFavoritesCount()
                        interv['stats']['own_tweets']['favorites']['accounts'][t.getUsername()] += t.getFavoritesCount()
                    for k,v in t.getFollowAccountsMentionCount().items():
                        if k in own_fa:
                            interv['stats']['mentions']['total'] += 1
                            interv['stats']['mentions']['accounts'][k] += 1
                    if t.getSentiment():
                        if not t.getSentiment() in interv['sentiment']: interv['sentiment'][t.getSentiment()] = {"total": 0}
                        interv['sentiment'][t.getSentiment()]['total'] += 1
                    pms = t.getExtractedInfo()
                    if pms:
                        pm = pms[0]
                        try:
                            interv['brand'][pm['brand']] += 1
                        except KeyError,e: 
                            interv['brand'][pm['brand']] = 1
                        if pm['product']: 
                            p = pm['brand'] + "/" + pm['product']
                            try:
                                interv['product'][p] += 1
                            except KeyError, e:
                                interv['product'][p] = 1
                    topics = t.getExtractedTopics()
                    if topics is None: topics = []
                    for k in topics:
                        try:
                            interv['topic'][k['topic_name']]['total'] += 1
                        except KeyError, e:
                            interv['topic'][k['topic_name']] = {'total': 1}
                    for word in self.getWordsList(t.getText()):
                        if word in trend_stop_words_set: continue
                        word = word.lower()
                        nword = synonyms.get(word, word)
                        interv['words'][nword] = interv['words'].get(nword, 0) + 1

                    gender = t.getGender()
                    try:
                        interv['gender'][gender]['total'] += 1
                    except KeyError, e:
                        interv['gender'][gender] = {'total': 1}
        return timerange
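
SumDict is used throughout these summarization snippets but never defined in this listing. A minimal sketch of what it could be, assuming it is a dict subclass whose instances merge by summing values key by key (hypothetical; the real class may do more):

class SumDict(dict):
    # Assumption: adding two SumDicts merges them recursively, summing numeric leaves.
    def __add__(self, other):
        res = SumDict(self)
        for k, v in other.items():
            if k not in res:
                res[k] = v
            elif isinstance(res[k], dict) and isinstance(v, dict):
                res[k] = SumDict(res[k]) + SumDict(v)
            else:
                res[k] = res[k] + v
        return res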
Example #32
    def extractGender(cls, name):
        #nname = re.sub(ur'[_]+', u' ', name, flags=re.UNICODE)
        nname = re.sub(ur'[_\-\.]', u' ', name)
        nname = re.sub(ur'[^\w ]+', u'', nname)
        words = [w.lower() for w in nname.split() if len(w) > 1]
        names = cls.getNamesDatabase(
            max_age=timedelta(seconds=300))  #5 minutes
        k = 100
        M = 0
        F = 0
        for w in words:
            g = names.get(w, "U")
            if g == "M": M += k
            elif g == "F": F += k
            k -= 1
        if M + F == 0: return "U"
        if M > F: return "M"
        return "F"


if __name__ == "__main__":
    print GenderClassifier.getNamesDatabase()
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print t.getDisplayName(), g

    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print n, g
Example #33
            
    


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--regenerate', action="store_true", default=False)
    parser.add_argument('--account', default=None)
    parser.add_argument('--list', action="store_true", default=False)
    parser.add_argument('--start', default=None)
    parser.add_argument('--end', default=None)
    parser.add_argument('--clear', action="store_true", default=False)
    args, known = parser.parse_known_args()
    campaign = None
    if args.account:
        account = MongoManager.getAccount(name=args.account)
        if not account:
            pprint("Account %s not found" % args.account)
            exit(1)
        campaign = account.getActiveCampaigns()[0]

    summarizer = Summarizer()
    if not args.list and not args.clear:
        summarizer.start(campaign=campaign, regenerate=args.regenerate)
    elif args.clear and campaign:
        summarizer.clearSummarization(campaign)
    elif args.list and campaign and args.start and args.end:
        print args
        start = datetime.strptime(args.start, "%Y-%m-%dT%H")
        end = datetime.strptime(args.end, "%Y-%m-%dT%H")
        records = summarizer.getSummarizedData(campaign, start, end)
Example #34
from mongo import MongoManager

doc = {
    "nome": "Ruben2",
    "email": "*****@*****.**"
}

print "insert item 1"
mng = MongoManager()
IDResult = mng.add("tests", doc)
print IDResult
print doc
del doc["_id"]
print "insert item 2"
IDResult = mng.add("tests", doc)
print IDResult
print "find item 2"

filterDoc = {"_id": IDResult}
resultDoc = mng.get("tests", filterDoc)
print resultDoc
print "find all items"
resultDoc = mng.get("tests")
print resultDoc
Example #35
import os
import sys
from datetime import timedelta, date, datetime
import pandas as pd
import csv
try:
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    import global_func
    import define
    from mongo import MongoManager
    from define import DB_KEY as DB_KEY
except:
    import src.global_func
    import src.define
    from src.tools.mongo import MongoManager
    from src.define import DB_KEY as DB_KEY

mongo_mgr = MongoManager("mongodb://*****:*****@192.168.1.14:27017/stock")
LOG_ENABLE = True
def normalize_file(market_type:str, file_path:str):
    with open(file_path, "r+", encoding='utf8') as f:
        text = f.read()
        text_arr = [i.translate({ord(' '): None, ord('='):None}).rstrip(',') 
            for i in text.split('\n') 
                if (len(i.split('",')) >= 15 and len(i.split('",')) <= 17) or "代號" in i]
        if market_type == define.MarketType.TPEX:
            if len(text_arr) > 0:
                if "代號" in text_arr[0]:
                    del text_arr[0]
                if len(text_arr) > 0:
                    length = len(text_arr[0].split('",')) if text_arr is not None and len(text_arr) > 0 else 0
                    if "證券代號" not in text_arr[0]:
                        if length == 15:
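
The function body is cut off by the listing at this point. For context, a hedged sketch of how normalize_file might be invoked; the file path below is illustrative only:

# Hypothetical call with a made-up path; MarketType.TPEX comes from the snippet's define module.
normalize_file(define.MarketType.TPEX, "data/tpex_daily_quotes.csv")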
Example #36
    def extractGender(cls, name):
        #nname = re.sub(ur'[_]+', u' ', name, flags=re.UNICODE)
        nname = re.sub(ur'[_\-\.]', u' ', name)
        nname = re.sub(ur'[^\w ]+', u'', nname)
        words = [w.lower() for w in nname.split() if len(w) > 1]
        names = cls.getNamesDatabase(max_age=timedelta(seconds=300))  # 5 minutes
        k = 100
        M = 0
        F = 0
        for w in words:
            g = names.get(w, "U")
            if g == "M": M += k
            elif g == "F": F += k
            k -= 1
        if M + F == 0: return "U"
        if M > F: return "M"
        return "F"



if __name__ == "__main__":
    print GenderClassifier.getNamesDatabase()
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print t.getDisplayName(), g

    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print n, g
Example #37
 def getGlobalTrendStopWords(cls, language, **kwargs):
     max_age = kwargs.get('max_age', timedelta(seconds=0))
     if not max_age or not cls.global_trend_stop_words.get(language, None) or (datetime.now() - cls.global_trend_stop_words[language]['fetch_time'] > max_age):
         cls.global_trend_stop_words[language] = {'data': set(MongoManager.getGlobalTrendStopWords(language)['words']), 'fetch_time': datetime.now()}
     return cls.global_trend_stop_words[language]['data']
Example #38
    def calculateSummarizedIntervals(self,
                                     campaign,
                                     start,
                                     end,
                                     interval,
                                     tweetlist=None):
        pprint("summarizing tweets for campaign %s between %s and %s" %
               (campaign.getName(), start, end))
        synonyms = self.getTrendWordsSynonyms(campaign)
        trend_stop_words_set = self.getTrendStopWords(campaign)
        collection_name = 'summarized_tweets_%s' % campaign.getId()
        if tweetlist is None:
            tweetlist = MongoManager.findTweets("tweets_%s" % campaign.getId(),
                                                filters={
                                                    "retweeted_status": {
                                                        "$exists": False
                                                    },
                                                    "x_created_at": {
                                                        "$gte": start,
                                                        "$lte": end
                                                    }
                                                })
        own_fa = campaign.getOwnFollowAccounts()
        timerange = []
        d = start
        while d < end:
            data = SumDict({'start': d, 'end': d + interval})
            data['stats'] = SumDict()
            data['stats']['total_tweets'] = 0
            data['stats']['own_tweets'] = SumDict({
                'total':
                0,
                'accounts':
                SumDict([(a, 0) for a in own_fa])
            })
            data['stats']['own_tweets']['retweets'] = SumDict({
                'total':
                0,
                'accounts':
                SumDict([(a, 0) for a in own_fa])
            })
            data['stats']['own_tweets']['favorites'] = SumDict({
                'total':
                0,
                'accounts':
                SumDict([(a, 0) for a in own_fa])
            })
            data['stats']['mentions'] = SumDict({
                'total':
                0,
                'accounts':
                SumDict([(a, 0) for a in own_fa])
            })
            data['sentiment'] = SumDict()
            data['brand'] = SumDict()
            data['product'] = SumDict()
            data['topic'] = SumDict()
            data['gender'] = SumDict()
            data['words'] = SumDict()
            timerange.append(data)
            d = d + interval

        for t in tweetlist:
            for interv in timerange:
                if t.getCreatedDate() >= interv['start'] and t.getCreatedDate() < interv['end']:
                    interv['stats']['total_tweets'] += 1
                    if t.getUsername() in own_fa:
                        interv['stats']['own_tweets']['total'] += 1
                        interv['stats']['own_tweets']['accounts'][
                            t.getUsername()] += 1
                        interv['stats']['own_tweets']['retweets'][
                            'total'] += t.getRetweetsCount()
                        interv['stats']['own_tweets']['retweets']['accounts'][
                            t.getUsername()] += t.getRetweetsCount()
                        interv['stats']['own_tweets']['favorites'][
                            'total'] += t.getFavoritesCount()
                        interv['stats']['own_tweets']['favorites']['accounts'][
                            t.getUsername()] += t.getFavoritesCount()
                    for k, v in t.getFollowAccountsMentionCount().items():
                        if k in own_fa:
                            interv['stats']['mentions']['total'] += 1
                            interv['stats']['mentions']['accounts'][k] += 1
                    if t.getSentiment():
                        if not t.getSentiment() in interv['sentiment']:
                            interv['sentiment'][t.getSentiment()] = {
                                "total": 0
                            }
                        interv['sentiment'][t.getSentiment()]['total'] += 1
                    pms = t.getExtractedInfo()
                    if pms:
                        pm = pms[0]
                        try:
                            interv['brand'][pm['brand']] += 1
                        except KeyError, e:
                            interv['brand'][pm['brand']] = 1
                        if pm['product']:
                            p = pm['brand'] + "/" + pm['product']
                            try:
                                interv['product'][p] += 1
                            except KeyError, e:
                                interv['product'][p] = 1
                    topics = t.getExtractedTopics()
                    if topics is None: topics = []
                    for k in topics:
                        try:
                            interv['topic'][k['topic_name']]['total'] += 1
                        except KeyError, e:
                            interv['topic'][k['topic_name']] = {'total': 1}
                    for word in self.getWordsList(t.getText()):
                        if word in trend_stop_words_set: continue
                        word = word.lower()
                        nword = synonyms.get(word, word)
                        interv['words'][nword] = interv['words'].get(nword, 0) + 1

                    gender = t.getGender()
                    try:
                        interv['gender'][gender]['total'] += 1
                    except KeyError, e:
                        interv['gender'][gender] = {'total': 1}
        return timerange
Example #39
 def clearAllSummarizedData(self, campaign_id):  # this is duplicated!
     collection_name = 'summarized_tweets_%s' % campaign_id
     #print 41, datetime.now()
     res = MongoManager.remove(collection_name, filters={})
Example #40
 def generateGnipRulesFromMongo(self):
     accounts = MongoManager.getActiveAccounts()
     rules = []
     for acc in accounts:
         for camp in acc.getActiveCampaigns():
             for brand in camp.getBrands():
                 fa = sorted(brand.getFollowAccounts())
                 if fa:
                     rules.append({
                         "value":
                         " OR ".join(fa),
                         "tag":
                         "%s/%s/%s/follow accounts - mention" %
                         (acc.getName(), camp.getName(), brand.getName())
                     })
                     clean_user_names = [x.replace("@", "") for x in fa]
                     rules.append({
                         "value":
                         " OR ".join(
                             ["from:%s" % x for x in clean_user_names]),
                         "tag":
                         "%s/%s/%s/follow accounts - from" %
                         (acc.getName(), camp.getName(), brand.getName())
                     })
                     rules.append({
                         "value":
                         " OR ".join(
                             ["to:%s" % x for x in clean_user_names]),
                         "tag":
                         "%s/%s/%s/follow accounts - to" %
                         (acc.getName(), camp.getName(), brand.getName())
                     })
                 #BRAND RULES
                 for brule in brand.getIdentificationRules():
                     brule = brule.replace("[m]",
                                           "[M]").replace("[p]", "[P]")
                     for bsearch_keyword in brand.getSearchKeywords():
                         brand_replaced_rule = '"' + brule.replace(
                             "[M]", bsearch_keyword) + '"'
                         if (brule.upper().find("[P]") >= 0):
                             for product in brand.getProducts():
                                 if product.isUsingBrandIdRules():
                                     for psearch_keyword in product.getSearchKeywords(
                                     ):
                                         product_replaced_rule = brand_replaced_rule.replace(
                                             "[P]", psearch_keyword)
                                         rules.append({
                                             "value":
                                             product_replaced_rule,
                                             "tag":
                                             "%s/%s/%s/%s: %s" %
                                             (acc.getName(), camp.getName(),
                                              brand.getName(),
                                              product.getName(), brule)
                                         })
                         else:
                             rules.append({
                                 "value":
                                 brand_replaced_rule,
                                 "tag":
                                 "%s/%s/%s: %s" %
                                 (acc.getName(), camp.getName(),
                                  brand.getName(), brule)
                             })
                 #PRODUCT RULES
                 for product in brand.getProducts():
                     for prule in product.getIdentificationRules():
                         prule = prule.replace("[m]",
                                               "[M]").replace("[p]", "[P]")
                         for bsearch_keyword in brand.getSearchKeywords():
                             brand_replaced_rule = '"' + prule.replace(
                                 "[M]", bsearch_keyword) + '"'
                             for psearch_keyword in product.getSearchKeywords(
                             ):
                                 product_replaced_rule = brand_replaced_rule.replace(
                                     "[P]", psearch_keyword)
                                 rules.append({
                                     "value":
                                     product_replaced_rule,
                                     "tag":
                                     "%s/%s/%s/%s: %s" %
                                     (acc.getName(), camp.getName(),
                                      brand.getName(), product.getName(),
                                      prule)
                                 })
         for poll in acc.getActivePolls():
             rules.append({
                 "value":
                 " OR ".join(sorted(poll.getSearchHashtags())),
                 "tag":
                 "%s/poll %s" % (acc.getName(), poll.getName())
             })
     return rules
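
The generated rules are plain dicts with "value" and "tag" keys (the tag may be None, as in the fanpage variant in Example #17). A small hedged sketch of dumping them for inspection; "fetcher" stands in for whatever object defines generateGnipRulesFromMongo, and the actual push to Gnip is not shown in this listing:

import json

rules = fetcher.generateGnipRulesFromMongo()  # 'fetcher' is a placeholder instance
print(json.dumps(rules, indent=2))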
Example #41
 def clearSummarization(self, campaign):
     MongoManager.remove('summarized_tweets_%s' % campaign.getId())
Example #42
 def clearSummarization(self, campaign):
     MongoManager.remove('summarized_tweets_%s' % campaign.getId())
Example #43
def getUser(user):
    doc = {"_id": user}
    mng = MongoManager()
    dbResult = mng.get("tests", doc)
    response.set_header("Content-Type", "application/json")
Example #44
        return res
        """


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--regenerate', action="store_true", default=False)
    parser.add_argument('--account', default=None)
    parser.add_argument('--list', action="store_true", default=False)
    parser.add_argument('--start', default=None)
    parser.add_argument('--end', default=None)
    parser.add_argument('--clear', action="store_true", default=False)
    args, known = parser.parse_known_args()
    campaign = None
    if args.account:
        account = MongoManager.getAccount(name=args.account)
        if not account:
            pprint("Account %s not found" % args.account)
            exit(1)
        campaign = account.getActiveCampaigns()[0]

    summarizer = Summarizer()
    if not args.list and not args.clear:
        summarizer.start(campaign=campaign, regenerate=args.regenerate)
    elif args.clear and campaign:
        summarizer.clearSummarization(campaign)
    elif args.list and campaign and args.start and args.end:
        print args
        start = datetime.strptime(args.start, "%Y-%m-%dT%H")
        end = datetime.strptime(args.end, "%Y-%m-%dT%H")
        records = summarizer.getSummarizedData(campaign, start, end)