def batch_generator(twitterFile, goldstandard, batch_size=64):
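    """Stream (inputs, labels) training batches from a gzipped Twitter JSON file.

    `goldstandard` maps tweet ids to gold-standard Place objects. The
    tokenizers and label encoders used below are module-level globals that
    must have been fitted beforehand.
    """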
    while True:  # loop forever: Keras' fit_generator expects an endless generator
        with gzip.open(twitterFile, 'rb') as file:
            trainDescriptions = []
            trainLinks = []
            trainLocation = []
            trainSource = []
            trainTexts = []
            trainUserName = []
            trainTZ = []
            trainUtc = []
            trainUserLang = []
            trainCreatedAt = []
            trainUserMentions = []
            trainLabels = []
            for line in file:

                # The previous iteration yielded a full batch; reset the buffers
                if len(trainDescriptions) == batch_size:
                    trainDescriptions = []
                    trainLinks = []
                    trainLocation = []
                    trainSource = []
                    trainTexts = []
                    trainUserName = []
                    trainTZ = []
                    trainUtc = []
                    trainUserLang = []
                    trainCreatedAt = []
                    trainUserMentions = []
                    trainLabels = []

                instance = parseJsonLine(line.decode('utf-8'))

                trainDescriptions.append(str(instance.description))
                trainLinks.append(extractPreprocessUrl(instance.urls))
                trainLocation.append(str(instance.location))
                trainSource.append(str(instance.source))
                trainTexts.append(instance.text)
                trainUserName.append(str(instance.name))
                trainTZ.append(str(instance.timezone))
                trainUtc.append(str(instance.utcOffset))
                trainUserLang.append(str(instance.userLanguage))
                trainCreatedAt.append(
                    str(instance.createdAt.hour) + "-" +
                    str(roundMinutes(instance.createdAt.minute)))
                trainUserMentions.append(instance.userMentions)

                # Gold-standard city label for this tweet
                trainLabel = goldstandard[instance.id]._name
                trainLabels.append(trainLabel)

                # A full batch has been collected: vectorise each feature and yield it
                if len(trainDescriptions) == batch_size:

                    # Descriptions
                    trainDescriptions = descriptionTokenizer.texts_to_sequences(
                        trainDescriptions)
                    # pad_sequences accepts the ragged list of index sequences directly
                    trainDescriptions = pad_sequences(
                        trainDescriptions, maxlen=MAX_DESC_SEQUENCE_LENGTH)

                    # Link mentions: URL domain
                    trainDomain = list(map(lambda x: x[0], trainLinks))
                    categorical = np.zeros(
                        (len(trainDomain), len(domainEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainDomain)):
                        # Domains unseen during fitting stay all-zero rows
                        if trainDomain[i] in domainEncoder.classes_:
                            categorical[i, domainEncoder.transform(
                                [trainDomain[i]])[0]] = True
                    trainDomain = categorical

                    # URL suffix (top-level domain)
                    trainTld = list(map(lambda x: x[1], trainLinks))
                    categorical = np.zeros(
                        (len(trainTld), len(tldEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainTld)):
                        if trainTld[i] in tldEncoder.classes_:
                            categorical[i, tldEncoder.transform(
                                [trainTld[i]])[0]] = True
                    trainTld = categorical

                    # Location
                    trainLocation = locationTokenizer.texts_to_sequences(
                        trainLocation)
                    trainLocation = pad_sequences(
                        trainLocation, maxlen=MAX_LOC_SEQUENCE_LENGTH)

                    # Source (one-hot; transform raises on unseen sources)
                    trainSource = sourceEncoder.transform(trainSource)
                    categorical = np.zeros(
                        (len(trainSource), len(sourceEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainSource)):
                        categorical[i, trainSource[i]] = True
                    trainSource = categorical

                    # Tweet text
                    trainTexts = textTokenizer.texts_to_sequences(trainTexts)
                    trainTexts = pad_sequences(trainTexts,
                                               maxlen=MAX_TEXT_SEQUENCE_LENGTH)

                    # User name
                    trainUserName = nameTokenizer.texts_to_sequences(
                        trainUserName)
                    trainUserName = pad_sequences(
                        trainUserName, maxlen=MAX_NAME_SEQUENCE_LENGTH)

                    # Time zone
                    trainTZ = timeZoneTokenizer.texts_to_sequences(trainTZ)
                    trainTZ = pad_sequences(trainTZ,
                                            maxlen=MAX_TZ_SEQUENCE_LENGTH)

                    # UTC offset
                    trainUtc = utcEncoder.transform(trainUtc)
                    categorical = np.zeros(
                        (len(trainUtc), len(utcEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainUtc)):
                        categorical[i, trainUtc[i]] = True
                    trainUtc = categorical

                    # User language (63 languages)
                    trainUserLang = langEncoder.transform(trainUserLang)
                    categorical = np.zeros(
                        (len(trainUserLang), len(langEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainUserLang)):
                        categorical[i, trainUserLang[i]] = True
                    trainUserLang = categorical

                    # Tweet time (120 steps)
                    trainCreatedAt = timeEncoder.transform(trainCreatedAt)
                    categorical = np.zeros(
                        (len(trainCreatedAt), len(timeEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainCreatedAt)):
                        categorical[i, trainCreatedAt[i]] = True
                    trainCreatedAt = categorical

                    # Class label
                    classes = classEncoder.transform(trainLabels)

                    yield (
                        {
                            'inputDescription': trainDescriptions,
                            'inputDomain': trainDomain,
                            'inputTld': trainTld,
                            'inputLocation': trainLocation,
                            'inputSource': trainSource,
                            'inputText': trainTexts,
                            'inputUser': trainUserName,
                            'inputTimeZone': trainTZ,
                            'inputUTC': trainUtc,
                            'inputUserLang': trainUserLang,
                            'inputTweetTime': trainCreatedAt
                        },
                        classes)
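
A minimal usage sketch (not part of the original snippet; `model` and TRAINING_SIZE are assumed): the dict keys yielded above are meant to match the input-layer names of a Keras functional model, so the generator can be passed straight to fit_generator.

model.fit_generator(
    batch_generator(twitterFile, goldstandard, batch_size=64),
    steps_per_epoch=TRAINING_SIZE // 64,  # batches per pass over the training data
    epochs=5)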
Example #2
def text_generator(trainingFile):
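    """Yield the raw text of every tweet in a gzipped Twitter JSON file."""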
    with gzip.open(trainingFile, 'rb') as file:
        for line in file:
            instance = parseJsonLine(line.decode('utf-8'))
            yield str(instance.text)
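
A hedged usage example (assumed, not shown in the source): Keras' Tokenizer.fit_on_texts iterates over its argument, so the generator lets the vocabulary be built without holding all ~9 million tweet texts in memory.

from keras.preprocessing.text import Tokenizer

textTokenizer = Tokenizer(num_words=100000)  # vocabulary size is an assumption
textTokenizer.fit_on_texts(text_generator(trainingFile))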
Example #3
# Load test data
testDescription = []
testLinks = []
testLocations = []
testSource = []
testTexts = []
testUserName = []
testTimeZone = []
testUtc = []
testUserIds = []
testUserLang = []
testCreatedAt = []

with open(testFile) as f:
    for line in f:
        instance = parseJsonLine(line)

        testDescription.append(str(instance.description))
        testLinks.append(extractPreprocessUrl(instance.urls))
        testLocations.append(str(instance.location))
        testSource.append(str(instance.source))
        testTexts.append(instance.text)
        testUserName.append(str(instance.name))
        testTimeZone.append(str(instance.timezone))
        testUtc.append(str(instance.utcOffset))
        testUserLang.append(str(instance.userLanguage))
        testCreatedAt.append(
            str(instance.createdAt.hour) + "-" +
            str(roundMinutes(instance.createdAt.minute)))
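
A sketch of the presumable next step (an assumption, mirroring the preprocessing in batch_generator): the collected test lists would be vectorised with the same fitted tokenizers and encoders before prediction, e.g. for the description channel:

testDescription = descriptionTokenizer.texts_to_sequences(testDescription)
testDescription = pad_sequences(testDescription, maxlen=MAX_DESC_SEQUENCE_LENGTH)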
Example #4
import gzip
import json

import numpy as np

from representation import parseJsonLine, Place

trainingFile = "data/train/training.twitter.json.gz"  # File with all ~9 million training tweets
placesFile = 'data/train/training.json.gz'  # Place annotations provided by the task organisers

testTweet = "data/test/test.tweet.json"  # WNUT test file for tweets
testUser = "******"  # WNUT test file for users

# Parse Twitter JSON
tweetToTextMapping = {}  # maps tweet id -> parsed tweet instance
with gzip.open(trainingFile, 'rb') as file:
    for line in file:
        instance = parseJsonLine(line.decode('utf-8'))
        tweetToTextMapping[instance.id] = instance

# Parse the place annotations and attach the gold label to each tweet
with gzip.open(placesFile, 'rb') as file:
    for line in file:
        parsed_json = json.loads(line.decode('utf-8'))
        tweetId = int(parsed_json["tweet_id"])
        if tweetId in tweetToTextMapping:
            place = Place(name=parsed_json["tweet_city"],
                          lat=parsed_json["tweet_latitude"],
                          lon=parsed_json["tweet_longitude"])
            tweetToTextMapping[tweetId].place = place

# Extract the place gold label of every training tweet
places = [tweet.place for tweet in tweetToTextMapping.values()]
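
A possible continuation (an assumption; the snippet ends here): batch_generator expects a fitted classEncoder over the gold city names, which could be derived from the collected places, assuming tweets that received no annotation expose place as None.

from sklearn.preprocessing import LabelEncoder

classEncoder = LabelEncoder()
classEncoder.fit([place._name for place in places if place is not None])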