def _one_hot(values, encoder):
    """One-hot encode *values* with a fitted label encoder.

    Every value must already be known to the encoder —
    ``encoder.transform`` raises on unseen labels, matching the original
    inline loops for source / UTC / language / tweet-time features.
    """
    matrix = np.zeros((len(values), len(encoder.classes_)), dtype="bool")
    for row, column in enumerate(encoder.transform(values)):
        matrix[row, column] = True
    return matrix


def _one_hot_if_known(values, encoder):
    """One-hot encode *values*, leaving an all-zero row for any label the
    encoder has never seen (used for URL domains and top-level domains,
    which may not appear in the training vocabulary)."""
    matrix = np.zeros((len(values), len(encoder.classes_)), dtype="bool")
    for row, value in enumerate(values):
        if value in encoder.classes_:
            matrix[row, encoder.transform([value])[0]] = True
    return matrix


def batch_generator(twitterFile, goldstandard, batch_size=64):
    """Endlessly stream (inputs, targets) mini-batches from a gzipped
    Twitter JSON-lines file.

    Parameters
    ----------
    twitterFile : str
        Path to the gzip-compressed file, one tweet JSON object per line.
    goldstandard : mapping
        Maps tweet id to a gold place object; its ``._name`` attribute is
        used as the class label.
    batch_size : int, optional
        Number of tweets per yielded batch (default 64).

    Yields
    ------
    (dict, np.ndarray)
        Keras-style pair; the dict keys correspond to the model's named
        input layers. A trailing partial batch is dropped, but the outer
        loop restarts the file so those tweets are seen on later passes.
    """
    # The infinite loop is required: Keras-style fit_generator() pulls
    # batches forever and counts epochs via steps_per_epoch.
    while True:
        with gzip.open(twitterFile, 'rb') as tweet_stream:
            # Raw per-tweet feature accumulators for the current batch.
            descriptions = []
            links = []            # (domain, tld) pairs from extractPreprocessUrl
            locations = []
            sources = []
            texts = []
            user_names = []
            timezones = []
            utc_offsets = []
            user_langs = []
            created_ats = []
            labels = []

            for raw_line in tweet_stream:
                instance = parseJsonLine(raw_line.decode('utf-8'))

                descriptions.append(str(instance.description))
                links.append(extractPreprocessUrl(instance.urls))
                locations.append(str(instance.location))
                sources.append(str(instance.source))
                texts.append(instance.text)
                user_names.append(str(instance.name))
                timezones.append(str(instance.timezone))
                utc_offsets.append(str(instance.utcOffset))
                user_langs.append(str(instance.userLanguage))
                # Bucketed creation time, e.g. "14-30" (hour-roundedMinute).
                created_ats.append(str(instance.createdAt.hour) + "-"
                                   + str(roundMinutes(instance.createdAt.minute)))
                labels.append(goldstandard[instance.id]._name)

                if len(descriptions) < batch_size:
                    continue

                # --- tokenized + padded text inputs ---
                # pad_sequences accepts plain lists of int sequences; the
                # original np.asarray() round-trip was unnecessary (and
                # fails on ragged lists with modern NumPy).
                batch_descriptions = pad_sequences(
                    descriptionTokenizer.texts_to_sequences(descriptions),
                    maxlen=MAX_DESC_SEQUENCE_LENGTH)
                batch_locations = pad_sequences(
                    locationTokenizer.texts_to_sequences(locations),
                    maxlen=MAX_LOC_SEQUENCE_LENGTH)
                batch_texts = pad_sequences(
                    textTokenizer.texts_to_sequences(texts),
                    maxlen=MAX_TEXT_SEQUENCE_LENGTH)
                batch_names = pad_sequences(
                    nameTokenizer.texts_to_sequences(user_names),
                    maxlen=MAX_NAME_SEQUENCE_LENGTH)
                batch_timezones = pad_sequences(
                    timeZoneTokenizer.texts_to_sequences(timezones),
                    maxlen=MAX_TZ_SEQUENCE_LENGTH)

                # --- one-hot categorical inputs ---
                batch_domains = _one_hot_if_known(
                    [link[0] for link in links], domainEncoder)
                batch_tlds = _one_hot_if_known(
                    [link[1] for link in links], tldEncoder)
                batch_sources = _one_hot(sources, sourceEncoder)
                batch_utcs = _one_hot(utc_offsets, utcEncoder)
                batch_langs = _one_hot(user_langs, langEncoder)
                batch_times = _one_hot(created_ats, timeEncoder)

                yield ({
                    'inputDescription': batch_descriptions,
                    'inputDomain': batch_domains,
                    'inputTld': batch_tlds,
                    'inputLocation': batch_locations,
                    'inputSource': batch_sources,
                    'inputText': batch_texts,
                    'inputUser': batch_names,
                    'inputTimeZone': batch_timezones,
                    'inputUTC': batch_utcs,
                    'inputUserLang': batch_langs,
                    'inputTweetTime': batch_times,
                }, classEncoder.transform(labels))

                # Start a fresh batch. (The original cleared the lists
                # lazily at the top of the next iteration; clearing right
                # after the yield is equivalent and easier to follow.)
                for bucket in (descriptions, links, locations, sources,
                               texts, user_names, timezones, utc_offsets,
                               user_langs, created_ats, labels):
                    bucket.clear()
def text_generator(trainingFile):
    """Lazily yield the text of each tweet in a gzipped JSON-lines file.

    Parameters
    ----------
    trainingFile : str
        Path to the gzip-compressed file, one tweet JSON object per line.
    """
    with gzip.open(trainingFile, 'rb') as compressed:
        yield from (str(parseJsonLine(raw.decode('utf-8')).text)
                    for raw in compressed)
##Load test-data
# Per-tweet feature columns for the WNUT test file, filled in file order
# so index i of every list refers to the same tweet.
testDescription = []
testLinks = []
testLocations = []
testSource = []
testTexts = []
testUserName = []
testTimeZone = []
testUtc = []
testUserIds = []  # NOTE(review): never appended to here — presumably filled elsewhere; verify
testUserLang = []
testCreatedAt = []

# 'with' guarantees the handle is closed; the original opened the file
# without ever closing it.
with open(testFile) as test_file:
    for line in test_file:
        instance = parseJsonLine(line)
        testDescription.append(str(instance.description))
        testLinks.append(extractPreprocessUrl(instance.urls))
        testLocations.append(str(instance.location))
        testSource.append(str(instance.source))
        testTexts.append(instance.text)
        testUserName.append(str(instance.name))
        testTimeZone.append(str(instance.timezone))
        testUtc.append(str(instance.utcOffset))
        testUserLang.append(str(instance.userLanguage))
        # Bucketed creation time, e.g. "14-30" (hour-roundedMinute).
        testCreatedAt.append(str(instance.createdAt.hour) + "-"
                             + str(roundMinutes(instance.createdAt.minute)))
import numpy as np
from representation import parseJsonLine, Place

trainingFile = "data/train/training.twitter.json.gz"  # all ~9 Million training tweets
placesFile = 'data/train/training.json.gz'  # place annotation provided by task organisers
testTweet = "data/test/test.tweet.json"  # WNUT test file for tweets
testUser = "******"  # WNUT test file for User

# Parse Twitter-JSON into Map<Twitter-ID; tweet>.
with gzip.open(trainingFile, 'rb') as tweet_stream:
    tweetToTextMapping = {
        tweet.id: tweet
        for tweet in (parseJsonLine(raw.decode('utf-8'))
                      for raw in tweet_stream)
    }

# Parse the organisers' annotations and attach the gold place label to
# every training tweet that has one.
with gzip.open(placesFile, 'rb') as gold_stream:
    for raw in gold_stream:
        record = json.loads(raw.decode('utf-8'))
        tweet_id = int(record["tweet_id"])
        if tweet_id in tweetToTextMapping:
            tweetToTextMapping[tweet_id].place = Place(
                name=record["tweet_city"],
                lat=record["tweet_latitude"],
                lon=record["tweet_longitude"])

# Extract all place gold-labels for all tweets.
places = [tweet.place for tweet in tweetToTextMapping.values()]