Пример #1
0
 def process_text(self):
     """Normalise the tweet's text and source, then filter the text.

     The tweet text is passed through ``utils.clean`` and stored back, the
     tweet source is replaced by the result of ``utils.parse_alink``, and,
     when ``self.translation`` is truthy, language detection/translation is
     run before the final ``filter_text`` step.
     """
     cleaned_text = utils.clean(self.get_tweet_text())
     self.set_tweet_text(cleaned_text)

     raw_source = self.get_tweet_source()
     parsed_source = utils.parse_alink(raw_source)
     self.set_tweet_source(parsed_source)

     # Translation is opt-in; filtering always runs last.
     if self.translation:
         self.detect_language_or_translate()
     self.filter_text()
Пример #2
0
    def output_tweets_in_file(self, result):
        """Format one tweet record and hand it to the enabled output files.

        Depending on the instance flags, up to four lines are passed to
        ``self.output_data_file``:

        * geo file    -- ``uid,"lat long","keywords"`` when ``self.geo`` and
          ``self.geowrite`` are truthy and ``result["geo"]`` is not None;
        * user file   -- ``uid,<user columns>`` when ``self.userwrite`` is
          truthy and ``self.userfields`` produced any text;
        * tweets file -- ``uid,<tweet columns>`` when ``self.tweetwrite`` is
          truthy and ``self.tweetfields`` produced any text;
        * words file  -- ``"keywords"`` when ``self.wordswrite`` is truthy,
          unless the ``tokens`` tweet field was processed and was empty.

        Args:
            result: Mapping describing a single tweet. ``result["user"]["id"]``
                is always read; ``result["geo"]``, ``result["tokens"]`` and
                the keys named in ``self.userfields`` / ``self.tweetfields``
                are read only on the paths that need them. The record is
                not modified.

        Returns:
            None. All output goes through ``self.output_data_file``.
        """

        def join_tokens(tokens):
            # Lower-cased tokens separated by single spaces, UTF-8 encoded.
            # NOTE(review): tokens are emitted in their incoming order; no
            # sort is applied -- confirm whether sorted output is wanted.
            joined = " ".join(token.lower() for token in tokens)
            return joined.rstrip().encode("UTF-8")

        uid = result['user']['id']
        keywords = ""
        points = ""
        words = True
        # Defined up front so the write-out checks below are valid even when
        # userwrite / tweetwrite are switched off.
        user = ""
        tweet = ""

        if self.geo and self.geowrite and result["geo"] is not None:
            lat = 0.00
            lon = 0.00
            coordinates = result["geo"]["coordinates"]
            total = len(coordinates)
            # Read in place (no popping) so the caller's record stays intact.
            # The second-to-last value fills the latitude slot; any other
            # value fills the longitude slot, the last one winning.
            for position, value in enumerate(coordinates):
                if total - position == 2:
                    lat = str(value)
                else:
                    lon = str(value)
            points = "%s %s" % (lat, lon)

        if self.userwrite and self.userfields:
            profile = result["user"]
            columns = []
            for field in self.userfields:
                if field in ('screenname', 'name'):
                    column = "\"%s\"" % profile[field].encode("UTF-8")
                elif field in ('description', 'time_zone'):
                    # Non-ASCII characters are dropped before cleaning.
                    column = "\"%s\"" % utils.clean(
                        profile[field].encode("ASCII", "ignore"))
                elif field == 'created_at':
                    # `parse` is presumably dateutil's parser -- confirm.
                    date = parse(profile[field].encode("UTF-8"))
                    column = "\"%s\"" % date.strftime("%Y-%m-%d %H:%M:%S")
                elif field != 'id':
                    value = profile[field]
                    column = "%s" % value if value != "" else "0"
                else:
                    # 'id' keeps an empty column; the uid is prefixed to
                    # the line when it is written out.
                    column = ""
                columns.append(column)
            # Joining puts commas between columns only (no trailing comma),
            # matching how the tweet columns are separated.
            user = ",".join(columns)

        if self.tweetwrite and self.tweetfields:
            columns = []
            for field in self.tweetfields:
                # Unrecognised fields still occupy an (empty) column.
                column = ""
                if field == 'source':
                    source = utils.parse_alink(result[field])
                    column = "\"%s\"" % source.encode("UTF-8")
                elif field == 'created_at':
                    date = parse(result[field].encode("UTF-8"))
                    column = "\"%s\"" % date.strftime("%Y-%m-%d %H:%M:%S")
                elif field == 'tokens':
                    keywords = join_tokens(result[field])
                    if not keywords:
                        # Nothing to write to the words file later on.
                        words = False
                    column = "\"%s\"" % keywords
                elif field == 'text':
                    text = utils.clean(result[field])
                    column = "\"%s\"" % text.encode("UTF-8")
                elif field == 'retweet_count':
                    column = "\"%s\"" % str(
                        result[field]).encode("UTF-8").replace("+", "")
                columns.append(column)
            tweet = ",".join(columns)

        if points:
            if not keywords:
                keywords = join_tokens(result["tokens"])
            geo_data = "%s,\"%s\",\"%s\"" % (uid, points, keywords)
            self.output_data_file(self.out_geo_file, geo_data)

        if user:
            user_data = "%s,%s" % (uid, user)
            self.output_data_file(self.out_user_file, user_data)

        if tweet:
            tweets_data = "%s,%s" % (uid, tweet)
            self.output_data_file(self.out_tweets_file, tweets_data)

        if self.wordswrite and words:
            if not keywords:
                keywords = join_tokens(result["tokens"])
            words_data = "\"%s\"" % keywords
            self.output_data_file(self.out_words_file, words_data)
Пример #3
0
    def output_tweets_in_file(self, result):
        """Format one tweet record and hand it to the enabled output files.

        Depending on the instance flags, up to four lines are passed to
        ``self.output_data_file``:

        * geo file    -- ``uid,"lat long","keywords"`` when ``self.geo`` and
          ``self.geowrite`` are truthy and ``result["geo"]`` is not None;
        * user file   -- ``uid,<user columns>`` when ``self.userwrite`` is
          truthy and ``self.userfields`` produced any text;
        * tweets file -- ``uid,<tweet columns>`` when ``self.tweetwrite`` is
          truthy and ``self.tweetfields`` produced any text;
        * words file  -- ``"keywords"`` when ``self.wordswrite`` is truthy,
          unless the ``tokens`` tweet field was processed and was empty.

        Args:
            result: Mapping describing a single tweet. ``result["user"]["id"]``
                is always read; ``result["geo"]``, ``result["tokens"]`` and
                the keys named in ``self.userfields`` / ``self.tweetfields``
                are read only on the paths that need them. The record is
                not modified.

        Returns:
            None. All output goes through ``self.output_data_file``.
        """

        def join_tokens(tokens):
            # Lower-cased tokens separated by single spaces, UTF-8 encoded.
            # NOTE(review): tokens are emitted in their incoming order; no
            # sort is applied -- confirm whether sorted output is wanted.
            joined = " ".join(token.lower() for token in tokens)
            return joined.rstrip().encode("UTF-8")

        uid = result['user']['id']
        keywords = ""
        points = ""
        words = True
        # Defined up front so the write-out checks below are valid even when
        # userwrite / tweetwrite are switched off.
        user = ""
        tweet = ""

        if self.geo and self.geowrite and result["geo"] is not None:
            lat = 0.00
            lon = 0.00
            coordinates = result["geo"]["coordinates"]
            total = len(coordinates)
            # Read in place (no popping) so the caller's record stays intact.
            # The second-to-last value fills the latitude slot; any other
            # value fills the longitude slot, the last one winning.
            for position, value in enumerate(coordinates):
                if total - position == 2:
                    lat = str(value)
                else:
                    lon = str(value)
            points = "%s %s" % (lat, lon)

        if self.userwrite and self.userfields:
            profile = result["user"]
            columns = []
            for field in self.userfields:
                if field in ('screenname', 'name'):
                    column = "\"%s\"" % profile[field].encode("UTF-8")
                elif field in ('description', 'time_zone'):
                    # Non-ASCII characters are dropped before cleaning.
                    column = "\"%s\"" % utils.clean(
                        profile[field].encode("ASCII", "ignore"))
                elif field == 'created_at':
                    # `parse` is presumably dateutil's parser -- confirm.
                    date = parse(profile[field].encode("UTF-8"))
                    column = "\"%s\"" % date.strftime("%Y-%m-%d %H:%M:%S")
                elif field != 'id':
                    value = profile[field]
                    column = "%s" % value if value != "" else "0"
                else:
                    # 'id' keeps an empty column; the uid is prefixed to
                    # the line when it is written out.
                    column = ""
                columns.append(column)
            # Joining puts commas between columns only (no trailing comma),
            # matching how the tweet columns are separated.
            user = ",".join(columns)

        if self.tweetwrite and self.tweetfields:
            columns = []
            for field in self.tweetfields:
                # Unrecognised fields still occupy an (empty) column.
                column = ""
                if field == 'source':
                    source = utils.parse_alink(result[field])
                    column = "\"%s\"" % source.encode("UTF-8")
                elif field == 'created_at':
                    date = parse(result[field].encode("UTF-8"))
                    column = "\"%s\"" % date.strftime("%Y-%m-%d %H:%M:%S")
                elif field == 'tokens':
                    keywords = join_tokens(result[field])
                    if not keywords:
                        # Nothing to write to the words file later on.
                        words = False
                    column = "\"%s\"" % keywords
                elif field == 'text':
                    text = utils.clean(result[field])
                    column = "\"%s\"" % text.encode("UTF-8")
                elif field == 'retweet_count':
                    column = "\"%s\"" % str(
                        result[field]).encode("UTF-8").replace("+", "")
                columns.append(column)
            tweet = ",".join(columns)

        if points:
            if not keywords:
                keywords = join_tokens(result["tokens"])
            geo_data = "%s,\"%s\",\"%s\"" % (uid, points, keywords)
            self.output_data_file(self.out_geo_file, geo_data)

        if user:
            user_data = "%s,%s" % (uid, user)
            self.output_data_file(self.out_user_file, user_data)

        if tweet:
            tweets_data = "%s,%s" % (uid, tweet)
            self.output_data_file(self.out_tweets_file, tweets_data)

        if self.wordswrite and words:
            if not keywords:
                keywords = join_tokens(result["tokens"])
            words_data = "\"%s\"" % keywords
            self.output_data_file(self.out_words_file, words_data)