Example #1
0
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk-index actions for LinkedIn contact data.

    Streams newline-delimited JSON docs from *json_file*, normalizes a
    search term/key pair according to *whatStuff*, tags each doc with
    load/source type, and yields one bulk action per document.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label ("Email Addresses", "PhoneNumbers" or
            "Profile_fixed") selecting which fields are normalized.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' not in doc:
            json_doc = json.loads(doc)

            # whatStuff values are mutually exclusive, so elif is safe
            if whatStuff == "Email Addresses":
                json_doc["term"] = json_doc["Email Address"]
                json_doc["key"] = "email"
            elif whatStuff == "PhoneNumbers":
                json_doc["term"] = json_doc["Number"]
                json_doc["key"] = "phone"
            elif whatStuff == "Profile_fixed":
                json_doc["term"] = c.cleanText(json_doc["term"])
                json_doc["key"] = c.cleanText(json_doc["key"])

            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "linkedIn"

            # json.dumps emits valid JSON; the old str().replace("'", '"')
            # corrupted any document whose values contained an apostrophe
            # and left True/False/None in Python (non-JSON) form.
            new_doc = json.dumps(json_doc)

            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
Example #2
0
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for Facebook poll documents.

    Cleans poll option text and vote flags plus the document title of
    special characters/emojis, tags the doc, and yields a bulk action.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # Filter bulk-API action lines BEFORE parsing: the original checked
        # the serialized output only after processing, so an action line
        # (which has no "attachments"/"title") raised KeyError first.
        # Generator keeps memory usage flat.
        if '{"index"' in doc:
            continue

        json_doc = json.loads(doc)

        # clean the text in poll options from special characters and
        # emojis after json conversion
        for attachments in json_doc['attachments']:
            for dt in attachments['data']:
                for pl in dt['poll']['options']:
                    pl["option"] = c.cleanText(pl["option"])
                    # "voted" may be a bool; cleanText expects a string
                    pl["voted"] = c.cleanText(str(pl["voted"]))

        json_doc["title"] = c.cleanText(json_doc["title"])

        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "facebook"

        # json.dumps emits valid JSON; str().replace("'", '"') corrupted
        # documents containing apostrophes
        new_doc = json.dumps(json_doc)

        yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, loadType):
    """Generate Elasticsearch bulk actions for Facebook page/other docs.

    Cleans the "name" field for page documents (or "title" for any other
    load type), tags each doc, and yields one bulk action per document.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        loadType: load-type label; 'pages' selects the "name" field.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' not in doc:
            json_doc = json.loads(doc)

            # page docs carry "name"; every other load type carries "title" —
            # clean whichever applies of special characters and emojis
            field = "name" if loadType == 'pages' else "title"
            json_doc[field] = c.cleanText(json_doc[field])

            # add load_type, used later for filter
            json_doc["load_type"] = loadType
            json_doc["source_type"] = "facebook"

            # json.dumps emits valid JSON; str().replace("'", '"') corrupted
            # documents containing apostrophes
            new_doc = json.dumps(json_doc)

            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield an Elasticsearch bulk action for each raw JSON document.

    Documents are streamed with a generator so the file is never fully
    loaded into memory; bulk-API "index" action lines are skipped and the
    remaining lines are passed through unmodified as "_source".
    """
    for doc in c.getDataFromFile(json_file):
        # guard clause: drop bulk-API action lines, keep only data lines
        if '{"index"' in doc:
            continue
        yield {
            "_index": _index,
            "_id": uuid.uuid4(),
            "_source": doc,
        }
Example #5
0
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for Twitter documents.

    No field cleaning is done for this load; each document is only tagged
    with load/source type before being re-serialized.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' not in doc:
            json_doc = json.loads(doc)

            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "twitter"

            # json.dumps emits valid JSON; str().replace("'", '"') corrupted
            # documents containing apostrophes
            new_doc = json.dumps(json_doc)

            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for Facebook comment documents.

    Cleans the comment text (and optional group name) and the title,
    attaches sentiment scores for the comment text, tags the document,
    and yields one bulk action per document.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # Filter bulk-API action lines BEFORE parsing: the original checked
        # the serialized output only after processing, so an action line
        # (which has no "title") raised KeyError first. Generator keeps
        # memory usage flat.
        if '{"index"' in doc:
            continue

        json_doc = json.loads(doc)

        # defaults used when there is no comment text:
        # [mySentiment, sentPositive, sentNegative]
        sentiment = [0, 0, 0]

        # clean the comment text from special characters and emojis
        # after json conversion
        if "data" in json_doc:
            comment_holder = json_doc["data"][0]["comment"]
            my_text = comment_holder["comment"]

            # get sentiment
            sentiment = s.getSentiment(my_text)

            clean_my_text = c.cleanText(my_text)
            comment_holder["comment"] = clean_my_text
            json_doc["all_text"] = clean_my_text

            if "group" in comment_holder:
                comment_holder["group"] = c.cleanText(comment_holder["group"])

        json_doc["title"] = c.cleanText(json_doc["title"])

        # add sentiment
        json_doc["mySentiment"] = sentiment[0]
        json_doc["sentPositive"] = sentiment[1]
        json_doc["sentNegative"] = sentiment[2]

        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "facebook"

        # json.dumps emits valid JSON; str().replace("'", '"') corrupted
        # documents containing apostrophes
        new_doc = json.dumps(json_doc)

        yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
Example #7
0
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for LinkedIn connection docs.

    Cleans the name/company/position fields of special characters and
    emojis, tags each document, and yields one bulk action per document.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' not in doc:
            json_doc = json.loads(doc)

            # clean the text fields from special characters and emojis
            # after json conversion (was four copy-pasted blocks)
            for field in ("First Name", "Last Name", "Company", "Position"):
                json_doc[field] = c.cleanText(json_doc[field])

            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "linkedIn"

            # json.dumps emits valid JSON; str().replace("'", '"') corrupted
            # documents containing apostrophes (common in names/companies)
            new_doc = json.dumps(json_doc)

            yield {
                "_index": _index,
                "_id": uuid.uuid4(),
                "_source": new_doc,
            }
Example #8
0
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for Facebook post documents.

    Cleans post text, attachment names/titles/descriptions and the title,
    converts place coordinates into a [lon, lat] "location" field,
    attaches sentiment scores for the post text, tags the document, and
    yields one bulk action per document.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' not in doc:
            json_doc = json.loads(doc)

            # defaults used when there is no post text:
            # [mySentiment, sentPositive, sentNegative]
            sentiment = [0, 0, 0]

            # clean the post text from special characters and emojis
            # after json conversion
            if 'data' in json_doc:
                for dt in json_doc['data']:
                    if 'post' in dt:
                        my_text = dt["post"]

                        # get sentiment
                        sentiment = s.getSentiment(my_text)

                        clean_my_text = c.cleanText(my_text)
                        dt["post"] = clean_my_text
                        json_doc["all_text"] = clean_my_text

            if 'attachments' in json_doc:
                for att in json_doc['attachments']:
                    if 'data' in att:
                        for dt in att['data']:
                            if 'external_context' in dt:
                                if 'name' in dt["external_context"]:
                                    dt["external_context"]["name"] = c.cleanText(
                                        dt["external_context"]["name"])

                            if 'media' in dt:
                                dt['media']['title'] = c.cleanText(
                                    dt['media']['title'])

                                if 'description' in dt['media']:
                                    dt['media']["description"] = c.cleanText(
                                        dt['media']["description"])

                            if 'place' in dt:
                                # [lon, lat] order — presumably for an
                                # Elasticsearch geo_point array; confirm
                                # against the index mapping
                                my_loc = dt["place"]["coordinate"]
                                dt["place"]["location"] = [
                                    my_loc["longitude"],
                                    my_loc["latitude"],
                                ]

            if 'title' in json_doc:
                json_doc["title"] = c.cleanText(json_doc["title"])

            # add sentiment
            json_doc["mySentiment"] = sentiment[0]
            json_doc["sentPositive"] = sentiment[1]
            json_doc["sentNegative"] = sentiment[2]

            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "facebook"

            # json.dumps emits valid JSON; str().replace("'", '"') corrupted
            # documents containing apostrophes in post text
            new_doc = json.dumps(json_doc)

            yield {
                "_index": _index,
                "_id": uuid.uuid4(),
                "_source": new_doc,
            }
def bulkJsonData(json_file, _index, whatStuff):
    """Generate Elasticsearch bulk actions for Twitter archive documents.

    Unwraps the optional "tweet" envelope, cleans text fields and entity
    mentions/urls, attaches sentiment scores (skipped for retweets), tags
    the document, and yields one bulk action per document.

    Fixes in this revision:
      * The reply/mention cleaning and the final tagging/yield had drifted
        out of the '{"index"' guard's scope, so tweets without
        user_mentions were silently dropped and json_doc could be unbound
        on a filtered first line. Everything now lives inside the guard.
      * json.dumps replaces str().replace("'", '"') — it emits valid JSON
        with lowercase true/false, making the manual "False"->"false" /
        "True"->"true" patching unnecessary.

    Args:
        json_file: path handed to c.getDataFromFile (yields JSON strings).
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored on each document for filtering.

    Yields:
        dict with "_index", "_id" (random UUID) and "_source" (JSON string).
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory;
        # skip bulk-API action lines
        if '{"index"' in doc:
            continue

        outer_doc = json.loads(doc)

        # defaults used for retweets:
        # [mySentiment, sentPositive, sentNegative]
        sentiment = [0, 0, 0]

        # Twitter archive exports wrap each record in a "tweet" envelope
        json_doc = outer_doc.get("tweet", outer_doc)

        my_text = json_doc["full_text"]

        # get sentiment (retweets start with "RT" and are skipped)
        if not my_text.startswith("RT"):
            sentiment = s.getSentiment(my_text)

        json_doc["full_text"] = c.cleanText(my_text)
        json_doc["source"] = c.cleanText(json_doc["source"])

        if 'in_reply_to_screen_name' in json_doc:
            json_doc["in_reply_to_screen_name"] = c.cleanText(
                json_doc["in_reply_to_screen_name"])

        if 'user_mentions' in json_doc["entities"]:
            for usr in json_doc["entities"]['user_mentions']:
                usr["name"] = c.cleanText(usr["name"])
                usr["screen_name"] = c.cleanText(usr["screen_name"])

            # NOTE(review): url cleaning was gated on 'user_mentions' in the
            # original; preserved as-is — confirm whether 'urls' should be
            # checked independently.
            for url_entity in json_doc["entities"]['urls']:
                url_entity["url"] = c.cleanText(url_entity["url"])
                url_entity["expanded_url"] = c.cleanText(
                    url_entity["expanded_url"])
                url_entity["display_url"] = c.cleanText(
                    url_entity["display_url"])

        # add sentiment
        json_doc["mySentiment"] = sentiment[0]
        json_doc["sentPositive"] = sentiment[1]
        json_doc["sentNegative"] = sentiment[2]

        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "twitter"

        new_doc = json.dumps(json_doc)

        yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}