def to_array(self):
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                # Tag each tokenized line with a unique "<prefix>_<line number>" label.
                self.sentences.append(
                    LabeledSentence(tokenize(line), [prefix + '_%s' % item_no]))
    return self.sentences
def train():
    df_train, df_test = load_dataset()
    prepare_data_for_diagrams(df_train, df_test)
    plot_distributions(df_train, df_test)

    # Tokenize the raw tweet text in place.
    df_train['text'] = df_train.apply(
        lambda x: tweet_tokenizer.tokenize(x.text), axis=1)
    df_test['text'] = df_test.apply(
        lambda x: tweet_tokenizer.tokenize(x.text), axis=1)

    vectorizer = get_vectorizer(df_train, df_test)
    embedding_layer = get_embedding(vectorizer)
    model = get_model(embedding_layer)

    # Hold out 20% of the training data for validation.
    df_train, df_val = train_test_split(df_train, test_size=0.2)
    x_train = vectorizer(df_train['text'].to_numpy()[..., np.newaxis]).numpy()
    x_val = vectorizer(df_val['text'].to_numpy()[..., np.newaxis]).numpy()
    y_train = df_train['target'].to_numpy()
    y_val = df_val['target'].to_numpy()

    model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
    # Keep only the weights with the lowest validation loss.
    checkpoint = ModelCheckpoint(filepath="model.h5",
                                 monitor="val_loss",
                                 mode="min",
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False)
    history = model.fit(x_train, y_train,
                        batch_size=32,
                        epochs=nb_epochs,
                        validation_data=(x_val, y_val),
                        callbacks=[checkpoint])
    save_history(history)
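
# A minimal inference sketch (not from the original code): after train()
# finishes, the checkpoint saved as "model.h5" can be reloaded and applied to
# the test split. load_dataset, get_vectorizer, and tweet_tokenizer come from
# the code above; the function name and the 0.5 threshold are assumptions.
def predict_test():
    import tensorflow as tf
    df_train, df_test = load_dataset()
    df_test['text'] = df_test.apply(
        lambda x: tweet_tokenizer.tokenize(x.text), axis=1)
    vectorizer = get_vectorizer(df_train, df_test)
    x_test = vectorizer(df_test['text'].to_numpy()[..., np.newaxis]).numpy()
    model = tf.keras.models.load_model("model.h5")  # best weights by val_loss
    return (model.predict(x_test) > 0.5).astype("int32")  # binary labels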
def read_test_data(filename, vocab, labels_map, test_text_index,
                   test_label_index, delimiter='\t'):
    complete = []
    for name in [filename]:
        with open(name, "r") as file_in:
            datareader = csv.reader(file_in, delimiter=delimiter)
            tokens = []
            for line in datareader:
                if not line:
                    continue
                label = line[test_label_index]
                if label not in labels_map:
                    print('label does not exist', label)
                    continue
                tempList = [str(labels_map[label])]
                tempDict = {}  # <word index, number of times the word appears in the tweet>
                x = tokenize(line[test_text_index])  # list of words in the tweet
                if not x:
                    continue
                for item in x:
                    word = item.strip()
                    if word and word in vocab:  # discard words that are not in the vocab
                        tempDict[vocab[word]] = tempDict.get(vocab[word], 0) + 1
                # Emit the tweet as "label word_index:count ..." with indices sorted.
                for key in sorted(tempDict.keys()):
                    tempList.append(str(key) + ":" + str(tempDict[key]))
                tokens.append(" ".join(tempList))
            complete.extend(tokens)
    return complete
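
# A minimal, self-contained sketch (not in the original) of the LIBSVM-style
# encoding that read_test_data above and read_train_data below emit:
# "label word_index:count ...". The vocab, labels_map, and sample words are
# invented for illustration, and tokenize() is assumed to roughly split on
# whitespace.
def _format_example():
    vocab = {'storm': 3, 'flood': 7}      # word -> index, built from the train set
    labels_map = {'disaster': 1}          # label string -> numeric class
    words = 'storm flood storm'.split()   # stand-in for tokenize(tweet_text)
    counts = {}
    for w in words:
        if w in vocab:                    # out-of-vocab words are discarded
            counts[vocab[w]] = counts.get(vocab[w], 0) + 1
    parts = [str(labels_map['disaster'])]
    parts += ['%d:%d' % (k, counts[k]) for k in sorted(counts)]
    return ' '.join(parts)                # -> "1 3:2 7:1"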
def read_data(filename, tweet_index):
    all_rows = []
    all_tokens = []
    with open(filename, "r") as file_in:
        datareader = csv.reader(file_in, delimiter='\t')
        # next(datareader)  # uncomment to skip a header row
        for line in datareader:
            tweet = line[tweet_index]
            tokens = tokenize(tweet)
            # Keep the columns before the tweet and replace the tweet itself
            # with its whitespace-joined tokens.
            tokenized_row = line[0:tweet_index]
            tokenized_row.append(' '.join(tokens))
            all_rows.append(tokenized_row)
            all_tokens.append(tokens)
    return all_tokens, all_rows
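
# A short usage sketch (not in the original): read_data returns both the token
# lists and the rewritten rows, so the tokenized corpus can be written straight
# back out as TSV. The file names and the column index 2 are assumptions for
# illustration.
def _write_tokenized(infile='tweets.tsv', outfile='tweets_tokenized.tsv'):
    import csv
    all_tokens, all_rows = read_data(infile, tweet_index=2)
    with open(outfile, 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(all_rows)
    return all_tokens  # token lists, e.g. for building a vocab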
def parse_tweet_json_gmove(file):
    data_dict = {}
    with open(INPUT_FOLDER + file) as f:
        lines = f.readlines()
    tweet_count = 0
    for line in lines:
        json_tweet = json.loads(line)
        tweet_id = json_tweet['tweetId']
        uid = json_tweet['userId']
        timestamp_ms = json_tweet['timestamp']
        text = json_tweet['message'].strip()
        if len(text) < TWEET_MINIMUM_LENGTH:
            continue
        if IS_TOKENIZED:
            tokens = tokenize(text)
            if FILTER_BY_KEYWORD:
                # Keep only tweets that mention at least one tracked keyword.
                if not any(t in keywords for t in tokens):
                    continue
            text = ' '.join(tokens)
        loc = [json_tweet['lat'], json_tweet['lng']]
        if tweet_id and uid and timestamp_ms and loc:
            data_dict[tweet_id] = {
                'uid': uid,
                'timestamp': timestamp_ms,
                'text': text,
                'lat': loc[0],
                'lon': loc[1],
            }
            tweet_count += 1
            if tweet_count % 10000 == 0:
                print("Processed", tweet_count, "tweets")
    # Write one tweet per line: id, uid, lat, lon, timestamp, text.
    with open(OUTPUT_FOLDER + file, 'w') as file_out:
        for key, val in data_dict.items():
            file_out.write('\t'.join(map(str, [
                key, val['uid'], val['lat'], val['lon'],
                val['timestamp'], val['text']
            ])) + '\n')
    return len(lines)
def read_train_data(filename, vocab, labels_map, train_text_index,
                    train_label_index, delimiter="\t"):
    tokens = []
    with open(filename, "r") as file_in:
        datareader = csv.reader(file_in, delimiter=delimiter)
        for line in datareader:
            label = line[train_label_index]
            if label not in labels_map:
                print('label does not exist', label)
                continue
            tempList = [str(labels_map[label])]
            tempDict = {}  # <word index, number of times the word appears in the tweet>
            x = tokenize(line[train_text_index])
            for item in x:  # for each word in the tweet
                if len(item) > 0 and len(item.strip('\'"?,.')) > 0:
                    tempDict[vocab[item]] = tempDict.get(vocab[item], 0) + 1
            # Emit "label word_index:count ..." with the word indices sorted.
            for key in sorted(tempDict.keys()):
                tempList.append(str(key) + ":" + str(tempDict[key]))
            tokens.append(" ".join(tempList))
    return tokens
def parse_tweet_json(file):
    data_dict = {}
    with open(INPUT_FOLDER + file) as f:
        lines = f.readlines()
    for line in lines:
        json_tweet = json.loads(line)
        tweet_id = json_tweet['id']
        uid = json_tweet['user']['id']
        timestamp_ms = json_tweet['timestamp_ms']
        text = json_tweet['text'].strip()
        if len(text) < TWEET_MINIMUM_LENGTH:
            continue
        if IS_TOKENIZED:
            tokens = tokenize(text)
            if FILTER_BY_KEYWORD:
                # Keep only tweets that mention at least one tracked keyword.
                if not any(t in keywords for t in tokens):
                    continue
            text = ' '.join(tokens)
        # 'geo' is null for tweets without an exact point location.
        if not json_tweet.get('geo'):
            continue
        loc = json_tweet['geo']['coordinates']
        if tweet_id and uid and timestamp_ms and loc:
            data_dict[tweet_id] = {
                'uid': uid,
                'timestamp': timestamp_ms,
                'text': text,
                'lat': loc[0],
                'lon': loc[1],
            }
    # Write one tweet per line: id, uid, lat, lon, timestamp, text.
    with open(OUTPUT_FOLDER + file, 'w') as file_out:
        for key, val in data_dict.items():
            file_out.write('\t'.join(map(str, [
                key, val['uid'], val['lat'], val['lon'],
                val['timestamp'], val['text']
            ])) + '\n')
    return len(lines)
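
# A minimal sketch (not part of the original) of reading back the TSV that the
# two parse_tweet_json* functions write: one tweet per line with the columns
# id, uid, lat, lon, timestamp, text. The file name is an assumption.
def _load_parsed_tweets(path='parsed_tweets.tsv'):
    tweets = {}
    with open(path) as f:
        for row in f:
            tid, uid, lat, lon, ts, text = row.rstrip('\n').split('\t', 5)
            tweets[tid] = {'uid': uid, 'lat': float(lat), 'lon': float(lon),
                           'timestamp': ts, 'text': text}
    return tweets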
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                # Stream one tagged sentence at a time instead of holding the
                # whole corpus in memory (cf. to_array above).
                yield LabeledSentence(tokenize(line), [prefix + '_%s' % item_no])
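
# A usage sketch (not in the original): to_array and __iter__ read like methods
# of a gensim-style labeled-corpus class; it is assumed here to be named
# LabeledLineSentence and to take a {path: tag_prefix} dict. The paths, tags,
# and hyperparameters are illustrative. LabeledSentence implies an old gensim
# release; newer releases renamed it TaggedDocument (and docvecs -> dv).
from gensim.models import Doc2Vec

sources = {'train_pos.txt': 'TRAIN_POS', 'train_neg.txt': 'TRAIN_NEG'}
corpus = LabeledLineSentence(sources)      # assumed class name
model = Doc2Vec(min_count=2, window=5, size=100, workers=4)
model.build_vocab(corpus.to_array())       # one pass to collect the vocab
model.train(corpus)                        # streams sentences via __iter__
doc_vec = model.docvecs['TRAIN_POS_0']     # vector for the first positive line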