Example #1
def to_array(self):
    # Materialise every line of every source file as a LabeledSentence,
    # tagged with '<prefix>_<line number>'.
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    LabeledSentence(tokenize(line),
                                    [prefix + '_%s' % item_no]))
    return self.sentences
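Every example in this listing calls a module-level `tokenize` helper that the excerpts do not show. A minimal sketch of what such a helper might look like; the lower-casing, bytes handling, and token pattern below are assumptions for illustration, not the project's actual implementation:

import re


def tokenize(text):
    # Hypothetical stand-in for the tokenize() helper the examples assume:
    # decode bytes if needed, lower-case, and split into word-like tokens.
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='ignore')
    return re.findall(r"[a-z0-9_@#']+", text.lower())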
Example #2
def train():
    df_train, df_test = load_dataset()

    prepare_data_for_diagrams(df_train, df_test)
    plot_distributions(df_train, df_test)

    df_train['text'] = df_train['text'].apply(tweet_tokenizer.tokenize)
    df_test['text'] = df_test['text'].apply(tweet_tokenizer.tokenize)

    vectorizer = get_vectorizer(df_train, df_test)
    embedding_layer = get_embedding(vectorizer)
    model = get_model(embedding_layer)

    df_train, df_val = train_test_split(df_train, test_size=0.2)

    x_train = vectorizer(df_train['text'].to_numpy()[..., np.newaxis]).numpy()
    x_val = vectorizer(df_val['text'].to_numpy()[..., np.newaxis]).numpy()

    y_train = df_train['target'].to_numpy()
    y_val = df_val['target'].to_numpy()

    model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

    checkpoint = ModelCheckpoint(filepath="model.h5",
                                 monitor="val_loss",
                                 mode="min",
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False)

    history = model.fit(x_train,
                        y_train,
                        batch_size=32,
                        epochs=nb_epochs,
                        validation_data=(x_val, y_val),
                        callbacks=[checkpoint])

    save_history(history)
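Example #2's `train()` leans on module-level names defined elsewhere in its project: the `load_dataset`/`get_*`/`plot_*`/`save_history` helpers, a `tweet_tokenizer`, and `nb_epochs`. A hedged sketch of the imports it needs plus the two simplest globals, assuming NLTK's `TweetTokenizer` is the tokenizer being wrapped (that choice, and the values below, are assumptions):

import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint

# Assumed module-level configuration; values are illustrative only.
tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                 strip_handles=True)
nb_epochs = 10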
Example #3
def read_test_data(filename,
                   vocab,
                   labels_map,
                   test_text_index,
                   test_label_index,
                   delimiter='\t'):

    complete = []
    with open(filename, "r") as fin:
        datareader = csv.reader(fin, delimiter=delimiter)

        for line in datareader:
            if not line:
                continue
            label = line[test_label_index]
            if label not in labels_map:
                print('label does not exist', label)
                continue
            tempList = [str(labels_map[label])]
            tempDict = {}  # <word_index, number of times the word appears in the tweet>
            x = tokenize(line[test_text_index])
            if not x:
                continue

            for item in x:
                word = item.strip()
                if word and word in vocab:  # discard words that are not in vocab
                    word_index = vocab[word]
                    tempDict[word_index] = tempDict.get(word_index, 0) + 1

            # Build "label word_index:count ..." with word indices in sorted order.
            for key in sorted(tempDict.keys()):
                tempList.append('%s:%s' % (key, tempDict[key]))
            complete.append(" ".join(tempList))

    return complete
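`read_test_data` emits one libsvm-style line per tweet: the mapped label followed by sorted `word_index:count` pairs. A tiny illustration with made-up inputs, assuming `tokenize` splits the toy line into ['good', 'good', 'movie'] and that `csv` is imported at module level:

import csv
import tempfile

# Hypothetical vocab and label map, for illustration only.
vocab = {'good': 3, 'movie': 7}
labels_map = {'positive': 1, 'negative': 0}

with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as tmp:
    tmp.write('positive\tgood good movie\n')
    path = tmp.name

# Column 1 holds the text, column 0 the label.
print(read_test_data(path, vocab, labels_map,
                     test_text_index=1, test_label_index=0))
# -> ['1 3:2 7:1']   (label, then sorted word_index:count pairs)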
Example #4
def read_data(filename, tweet_index):
    # Tokenize the tweet column of a TSV file; return the token lists and the
    # rewritten rows (columns up to the tweet column, plus the tokenized text).
    all_rows = []
    all_tokens = []
    with open(filename, "r") as file_in:
        datareader = csv.reader(file_in, delimiter='\t')
        for line in datareader:
            tweet = line[tweet_index]
            tokens = tokenize(tweet)
            tokenized_tweet = ' '.join(tokens)
            tokenized_row = line[0:tweet_index]
            tokenized_row.append(tokenized_tweet)
            all_rows.append(tokenized_row)
            all_tokens.append(tokens)
    return all_tokens, all_rows
Example #5
def parse_tweet_json_gmove(file):
    # Parse GMove-style tweet JSON (one object per line) into a TSV of
    # <tweet_id, user_id, lat, lon, timestamp, text>.
    data_dict = {}
    with open(INPUT_FOLDER + file) as f:
        lines = f.readlines()
    tweet_count = 0
    for line in lines:
        json_tweet = json.loads(line)
        tweet_id = json_tweet['tweetId']
        uid = json_tweet['userId']
        timestamp_ms = json_tweet['timestamp']
        text = json_tweet['message'].strip()  # json.loads already yields str in Python 3
        if len(text) < TWEET_MINIMUM_LENGTH:
            continue
        if IS_TOKENIZED:
            tokens = tokenize(text)
            if FILTER_BY_KEYWORD:
                if not any(t in keywords for t in tokens):
                    continue
            text = ' '.join(tokens)
        loc = [json_tweet['lat'], json_tweet['lng']]
        if tweet_id and uid and timestamp_ms and loc:
            tweet_dict = {
                'uid': uid,
                'timestamp': timestamp_ms,
                'text': text,
                'lat': loc[0],
                'lon': loc[1]
            }
            data_dict[tweet_id] = tweet_dict

            tweet_count += 1
            if tweet_count % 10000 == 0:
                print("Processed", tweet_count, "tweets")

    with open(OUTPUT_FOLDER + file, 'w') as file_out:
        for key, val in data_dict.items():
            line = '\t'.join(
                map(str, [
                    key, val['uid'], val['lat'], val['lon'], val['timestamp'],
                    val['text']
                ])) + '\n'
            file_out.write(line)
    return len(lines)
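Both `parse_tweet_json_gmove` above and `parse_tweet_json` (Example #7 below) depend on module-level configuration that is not part of the excerpts. A sketch of plausible definitions; the paths, threshold, and keyword set are placeholders, not the project's real values:

import json

# Assumed module-level configuration; the concrete values are placeholders.
INPUT_FOLDER = 'raw_tweets/'
OUTPUT_FOLDER = 'parsed_tweets/'
TWEET_MINIMUM_LENGTH = 10      # drop tweets shorter than this many characters
IS_TOKENIZED = True            # run tokenize() over the message text
FILTER_BY_KEYWORD = False      # keep only tweets containing one of `keywords`
keywords = set()               # e.g. {'flood', 'earthquake'}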
Example #6
def read_train_data(filename,
                    vocab,
                    labels_map,
                    train_text_index,
                    train_label_index,
                    delimiter="\t"):

    tokens = []  # one libsvm-style line per tweet
    with open(filename, "r") as fin:
        datareader = csv.reader(fin, delimiter=delimiter)

        for line in datareader:
            label = line[train_label_index]
            if label not in labels_map:
                print('label does not exist', label)
                continue
            tempList = [str(labels_map[label])]
            tempDict = {}  # <word_index, number of times the word appears in the tweet>
            x = tokenize(line[train_text_index])
            # For each word in the tweet; the training vocab is assumed to
            # contain every token that survives the punctuation strip.
            for item in x:
                if len(item) > 0 and len(item.strip('\'"?,.')) > 0:
                    word_index = vocab[item]
                    tempDict[word_index] = tempDict.get(word_index, 0) + 1

            # Append sorted "word_index:count" pairs after the label, then join
            # into a single "label word_index:count ..." string.
            for key in sorted(tempDict.keys()):
                tempList.append('%s:%s' % (key, tempDict[key]))
            tokens.append(" ".join(tempList))

    return tokens
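`read_train_data` produces the same libsvm-style `label word_index:count ...` lines as `read_test_data`, but it indexes `vocab[item]` directly, so the training vocabulary is assumed to cover every token. A short usage sketch; the maps and file names here are hypothetical:

# Hypothetical maps and paths, for illustration only.
vocab = {'good': 3, 'movie': 7, 'bad': 9}
labels_map = {'positive': 1, 'negative': 0}

train_lines = read_train_data('train.tsv', vocab, labels_map,
                              train_text_index=1, train_label_index=0)
with open('train.libsvm', 'w') as fout:
    fout.write('\n'.join(train_lines) + '\n')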
Example #7
def parse_tweet_json(file):
    # Parse raw Twitter API JSON (one object per line) into a TSV of
    # <tweet_id, user_id, lat, lon, timestamp_ms, text>.
    data_dict = {}
    with open(INPUT_FOLDER + file) as f:
        lines = f.readlines()
    for line in lines:
        json_tweet = json.loads(line)
        tweet_id = json_tweet['id']
        uid = json_tweet['user']['id']
        timestamp_ms = json_tweet['timestamp_ms']
        text = json_tweet['text'].strip()  # json.loads already yields str in Python 3
        if len(text) < TWEET_MINIMUM_LENGTH:
            continue
        if IS_TOKENIZED:
            tokens = tokenize(text)
            if FILTER_BY_KEYWORD:
                if not any(t in keywords for t in tokens):
                    continue
            text = ' '.join(tokens)
        geo = json_tweet.get('geo')
        loc = geo['coordinates'] if geo else None  # geo is None for non-geotagged tweets
        if tweet_id and uid and timestamp_ms and loc:
            tweet_dict = {
                'uid': uid,
                'timestamp_ms': timestamp_ms,
                'text': text,
                'lat': loc[0],
                'lon': loc[1]
            }
            data_dict[tweet_id] = tweet_dict

    with open(OUTPUT_FOLDER + file, 'w') as file_out:
        for key, val in data_dict.items():
            line = '\t'.join(
                map(str, [
                    key, val['uid'], val['lat'], val['lon'], val['timestamp_ms'],
                    val['text']
                ])) + '\n'
            file_out.write(line)
    return len(lines)
Example #8
def __iter__(self):
    # Stream LabeledSentences lazily instead of building them all in memory.
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(tokenize(line),
                                      [prefix + '_%s' % item_no])
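Examples #1 and #8 read like two methods of the same corpus-streaming class: `self.sources` maps file paths to tag prefixes, `__iter__` yields LabeledSentences lazily, and `to_array` materialises them. A minimal sketch of how such a class might be assembled and fed to gensim's Doc2Vec, assuming a pre-4.0 gensim (which still ships `LabeledSentence` and `utils.smart_open`) and the same assumed `tokenize` helper as above; the class name, hyperparameters, and file names are placeholders:

from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence  # TaggedDocument in gensim >= 4.0


class LabeledLineSentence(object):  # hypothetical home for the two methods shown above
    def __init__(self, sources):
        self.sources = sources  # e.g. {'train_pos.txt': 'TRAIN_POS'}

    def __iter__(self):
        # Same body as Example #8.
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(tokenize(line),
                                          [prefix + '_%s' % item_no])

    def to_array(self):
        # Equivalent to Example #1: materialise the stream into a list.
        self.sentences = list(self)
        return self.sentences


corpus = LabeledLineSentence({'train_pos.txt': 'TRAIN_POS',
                              'train_neg.txt': 'TRAIN_NEG'})
model = Doc2Vec(vector_size=100, min_count=1, workers=4)
model.build_vocab(corpus.to_array())
model.train(corpus, total_examples=model.corpus_count, epochs=10)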