Example #1
import gzip

import numpy as np
from keras.preprocessing.sequence import pad_sequences  # tensorflow.keras in newer setups

# parseJsonLine, extractPreprocessUrl and roundMinutes are project helpers;
# the fitted tokenizers/encoders (descriptionTokenizer, domainEncoder, ...)
# and the MAX_*_SEQUENCE_LENGTH constants are likewise assumed to be defined
# elsewhere in the project.

testDescription = []
testLinks = []
testLocations = []
testSource = []
testTexts = []
testUserName = []
testTimeZone = []
testUtc = []
testUserIds = []
testUserLang = []
testCreatedAt = []

with open(testFile) as f:
    for line in f:
        instance = parseJsonLine(line)

        testDescription.append(str(instance.description))
        testLinks.append(extractPreprocessUrl(instance.urls))
        testLocations.append(str(instance.location))
        testSource.append(str(instance.source))
        testTexts.append(instance.text)
        testUserName.append(str(instance.name))
        testTimeZone.append(str(instance.timezone))
        testUtc.append(str(instance.utcOffset))
        testUserLang.append(str(instance.userLanguage))
        testCreatedAt.append(
            str(instance.createdAt.hour) + "-" +
            str(roundMinutes(instance.createdAt.minute)))

        testUserIds.append(instance.userName)  # note: stores the screen name, not instance.id
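
# roundMinutes is not shown on this page. A plausible sketch, assuming minutes
# are bucketed to the nearest quarter hour (5 buckets per hour, matching the
# 24 * 5 = 120 "Tweet-Time" classes used in batch_generator below):
#
# def roundMinutes(minute, base=15):
#     return int(base * round(float(minute) / base))  # 0, 15, 30, 45 or 60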

#############################
def batch_generator(twitterFile, goldstandard, batch_size=64):
    while True:  # loop forever: Keras' fit_generator expects an endless generator
        with gzip.open(twitterFile, 'rb') as file:
            trainDescriptions = []
            trainLinks = []
            trainLocation = []
            trainSource = []
            trainTexts = []
            trainUserName = []
            trainTZ = []
            trainUtc = []
            trainUserLang = []
            trainCreatedAt = []
            trainUserMentions = []
            trainLabels = []
            for line in file:

                # the previous batch was yielded at the bottom of the loop;
                # reset the buffers to start collecting a new one
                if len(trainDescriptions) == batch_size:
                    trainDescriptions = []
                    trainLinks = []
                    trainLocation = []
                    trainSource = []
                    trainTexts = []
                    trainUserName = []
                    trainTZ = []
                    trainUtc = []
                    trainUserLang = []
                    trainCreatedAt = []
                    trainUserMentions = []
                    trainLabels = []

                instance = parseJsonLine(line.decode('utf-8'))

                trainDescriptions.append(str(instance.description))
                trainLinks.append(extractPreprocessUrl(instance.urls))
                trainLocation.append(str(instance.location))
                trainSource.append(str(instance.source))
                trainTexts.append(instance.text)
                trainUserName.append(str(instance.name))
                trainTZ.append(str(instance.timezone))
                trainUtc.append(str(instance.utcOffset))
                trainUserLang.append(str(instance.userLanguage))
                trainCreatedAt.append(
                    str(instance.createdAt.hour) + "-" +
                    str(roundMinutes(instance.createdAt.minute)))
                trainUserMentions.append(instance.userMentions)

                trainLabel = goldstandard[instance.id]._name
                trainLabels.append(trainLabel)

                if len(trainDescriptions) == batch_size:

                    #Descriptions
                    trainDescriptions = descriptionTokenizer.texts_to_sequences(
                        trainDescriptions)
                    trainDescriptions = np.asarray(
                        trainDescriptions)  # convert to ndarray
                    trainDescriptions = pad_sequences(
                        trainDescriptions, maxlen=MAX_DESC_SEQUENCE_LENGTH)

                    # Link-Mentions
                    trainDomain = list(map(lambda x: x[0],
                                           trainLinks))  # URL domain
                    categorical = np.zeros(
                        (len(trainDomain), len(domainEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainDomain)):
                        if trainDomain[i] in domainEncoder.classes_:
                            categorical[i, domainEncoder.transform(
                                [trainDomain[i]])[0]] = True
                    trainDomain = categorical

                    trainTld = list(map(lambda x: x[1],
                                        trainLinks))  # URL suffix; top-level domain
                    categorical = np.zeros(
                        (len(trainTld), len(tldEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainTld)):
                        if trainTld[i] in tldEncoder.classes_:
                            categorical[i, tldEncoder.transform(
                                [trainTld[i]])[0]] = True
                    trainTld = categorical

                    # Location
                    trainLocation = locationTokenizer.texts_to_sequences(
                        trainLocation)
                    trainLocation = np.asarray(
                        trainLocation)  # convert to ndarray
                    trainLocation = pad_sequences(
                        trainLocation, maxlen=MAX_LOC_SEQUENCE_LENGTH)

                    # Source
                    trainSource = sourceEncoder.transform(trainSource)
                    categorical = np.zeros(
                        (len(trainSource), len(sourceEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainSource)):
                        categorical[i, trainSource[i]] = True
                    trainSource = categorical

                    #Text Tweet
                    trainTexts = textTokenizer.texts_to_sequences(trainTexts)
                    trainTexts = np.asarray(trainTexts)  # convert to ndarray
                    trainTexts = pad_sequences(trainTexts,
                                               maxlen=MAX_TEXT_SEQUENCE_LENGTH)

                    #User Name
                    trainUserName = nameTokenizer.texts_to_sequences(
                        trainUserName)
                    trainUserName = np.asarray(
                        trainUserName)  # convert to ndarray
                    trainUserName = pad_sequences(
                        trainUserName, maxlen=MAX_NAME_SEQUENCE_LENGTH)

                    #Time Zone
                    trainTZ = timeZoneTokenizer.texts_to_sequences(trainTZ)
                    trainTZ = np.asarray(trainTZ)  # convert to ndarray
                    trainTZ = pad_sequences(trainTZ,
                                            maxlen=MAX_TZ_SEQUENCE_LENGTH)

                    # UTC
                    trainUtc = utcEncoder.transform(trainUtc)
                    categorical = np.zeros(
                        (len(trainUtc), len(utcEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainUtc)):
                        categorical[i, trainUtc[i]] = True
                    trainUtc = categorical

                    # User-Language (63 languages)
                    trainUserLang = langEncoder.transform(trainUserLang)
                    categorical = np.zeros(
                        (len(trainUserLang), len(langEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainUserLang)):
                        categorical[i, trainUserLang[i]] = True
                    trainUserLang = categorical

                    # Tweet-Time (120 steps)
                    trainCreatedAt = timeEncoder.transform(trainCreatedAt)
                    categorical = np.zeros(
                        (len(trainCreatedAt), len(timeEncoder.classes_)),
                        dtype="bool")
                    for i in range(len(trainCreatedAt)):
                        categorical[i, trainCreatedAt[i]] = True
                    trainCreatedAt = categorical

                    # class label
                    classes = classEncoder.transform(trainLabels)

                    yield (
                        {
                            'inputDescription': trainDescriptions,
                            'inputDomain': trainDomain,
                            'inputTld': trainTld,
                            'inputLocation': trainLocation,
                            'inputSource': trainSource,
                            'inputText': trainTexts,
                            'inputUser': trainUserName,
                            'inputTimeZone': trainTZ,
                            'inputUTC': trainUtc,
                            'inputUserLang': trainUserLang,
                            'inputTweetTime': trainCreatedAt
                        },
                        classes)
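
# Minimal usage sketch for the generator above; `model`, `trainFile`,
# `goldstandard` and `n_train` are assumptions (a compiled Keras model whose
# input names match the dict keys yielded above, plus the training data):
#
# model.fit_generator(
#     batch_generator(trainFile, goldstandard, batch_size=64),
#     steps_per_epoch=n_train // 64,  # one pass over the data per epoch
#     epochs=5)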
Example #3
import numpy as np

# tweetToTextMapping and extractPreprocessUrl are assumed to be defined
# elsewhere in the project.

trainLabels = []
trainDescription = []
trainLinks = []
trainLocation = []
trainSource = []
trainTexts = []
trainUserName = []
trainTZ = []
trainUtc = []
trainUserLang = []
trainSinTime = []
trainCosTime = []
for tweet in tweetToTextMapping.values():
    trainLabels.append(tweet.place._name)

    trainDescription.append(str(tweet.description))
    trainLinks.append(extractPreprocessUrl(tweet.urls))
    trainLocation.append(str(tweet.location))
    trainSource.append(str(tweet.source))
    trainTexts.append(tweet.text)
    trainUserName.append(str(tweet.name))
    trainTZ.append(str(tweet.timezone))
    trainUtc.append(str(tweet.utcOffset))
    trainUserLang.append(str(tweet.userLanguage))

    # seconds since midnight, mapped onto the unit circle so that times
    # just before and just after midnight end up close together
    t = (tweet.createdAt.hour * 3600 + tweet.createdAt.minute * 60 +
         tweet.createdAt.second)
    t = 2 * np.pi * t / (24 * 60 * 60)
    trainSinTime.append(np.sin(t))
    trainCosTime.append(np.cos(t))

trainCreatedAt = np.column_stack((trainSinTime, trainCosTime))
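
# Why two columns: a single scalar time would place 23:59 and 00:00 at
# opposite ends of the feature range, while the sin/cos pair puts them next
# to each other on the unit circle. Quick check:
for h, m in [(0, 0), (23, 59), (12, 0)]:
    t = 2 * np.pi * (h * 3600 + m * 60) / (24 * 60 * 60)
    print((h, m), round(np.sin(t), 3), round(np.cos(t), 3))
# (0, 0) -> (0.0, 1.0); (23, 59) -> (-0.004, 1.0); (12, 0) -> (0.0, -1.0)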