def read_semeval(binary=False):
    """Load the SemEval tweet files and write shuffled, cleaned copies.

    Every listed file is read as UTF-8 from ``DATA_IN + "semeval/"``; each
    line is split on tabs with the label in the first field and the tweet in
    the second. Double quotes are stripped from the label and
    ``objective-OR-neutral`` is folded into ``neutral``. Lines whose label is
    not positive/neutral/negative are dropped, and neutral lines are dropped
    as well when ``binary`` is true. Kept tweets are passed through
    ``preprocess``.

    The examples of each file are shuffled and written, one
    ``label<TAB>tweet`` per line, to ``DATA_OUT`` under the lower-cased
    input file name.

    Returns:
        A list of ``(label, tweet)`` tuples covering all files, in file order.
    """
    source_files = [
        "semeval_train_complete.txt",
        "Twitter2013_raw.txt",
        "Twitter2014_raw.txt",
        "Twitter2015_raw.txt",
    ]
    # The binary label set is a subset of the full one, so a single
    # membership test against the right set reproduces the two-part filter.
    if binary:
        accepted = ["positive", "negative"]
    else:
        accepted = ["positive", "neutral", "negative"]

    collected = []
    for source_name in source_files:
        examples = []
        with codecs.open(DATA_IN + "semeval/%s" % source_name, "r", "utf-8") as reader:
            for raw_line in reader:
                fields = raw_line.replace("\n", "").split("\t")
                label = fields[0].replace("\"", "")
                if label == "objective-OR-neutral":
                    label = "neutral"
                if label not in accepted:
                    continue
                examples.append((label, preprocess(fields[1])))
        shuffle(examples)
        collected.extend(examples)
        with codecs.open(DATA_OUT + source_name.lower(), "w", "utf-8") as writer:
            for example in examples:
                writer.write('\t'.join(example) + "\n")
    return collected
def read_hcr(binary=False):
    """Load the HCR XML splits and write shuffled, cleaned text copies.

    For each of ``dev.xml``, ``train.xml`` and ``test.xml`` under
    ``DATA_IN + "hcr/"``, every ``<item>`` element is inspected: its
    ``label`` attribute selects the class and the text of its ``<content>``
    child is passed through ``preprocess``. Items whose label is not
    positive/neutral/negative are skipped, and neutral items are skipped as
    well when ``binary`` is true. Items without a ``label`` attribute are
    skipped too.

    The examples of each split are shuffled and written as UTF-8, one
    ``label<TAB>message`` per line, to ``DATA_OUT + "hcr_<split>.txt"``.

    Returns:
        A list of ``(label, message)`` tuples covering all splits, in the
        order dev, train, test.
    """
    # Local import so this function does not depend on a module-level import.
    import codecs

    if binary:
        accepted = ["positive", "negative"]
    else:
        accepted = ["positive", "neutral", "negative"]

    all_msgs = []
    for f in ["dev.xml", "train.xml", "test.xml"]:
        msgs = []
        with open(DATA_IN + "hcr/%s" % f) as fid:
            soup = BeautifulSoup(fid.read(), "xml")
        for item in soup.findAll('item'):
            # .get() turns a missing attribute into a skipped item instead of
            # a KeyError (None is never an accepted label).
            label = item.attrs.get('label')
            if label not in accepted:
                continue
            msg = item.find("content").text
            # BeautifulSoup already returns Unicode text; decoding it again
            # fails on Python 3 and on non-ASCII text under Python 2. Only
            # decode if a byte string shows up.
            if isinstance(msg, bytes):
                msg = msg.decode("utf-8")
            msgs.append((label, preprocess(msg)))
        shuffle(msgs)
        all_msgs += msgs
        fname = "hcr_%s.txt" % f.replace(".xml", "")
        # Write with an explicit UTF-8 codec, matching read_semeval, so
        # non-ASCII messages do not depend on the platform default encoding.
        with codecs.open(DATA_OUT + fname, "w", "utf-8") as fod:
            for ex in msgs:
                fod.write('\t'.join(ex) + "\n")
    return all_msgs
def inference(text):
    """Run the saved model on ``text[0]`` and return the prediction dict.

    ``text[0]`` is replaced in place by ``preprocess(text[0])``; callers see
    that change. A ``MyModel0`` is built on the CPU, its weights are loaded
    from ``model.pth``, and the upper-cased characters of the text are looked
    up in ``VOCAB`` to form a one-column index tensor. The model output is
    soft-maxed over dimension 2 and the per-position maximum gives the
    probability and predicted class.

    The prediction is shown with ``color_print``, converted with
    ``pred_to_dict``, printed, and returned.

    Args:
        text: A mutable sequence whose first element is the string to label.

    Returns:
        Whatever ``pred_to_dict(text[0], pred, prob)`` returns.
    """
    text[0] = preprocess(text[0])
    device = torch.device("cpu")
    hidden_size = 256
    model = MyModel0(len(VOCAB), 16, hidden_size).to(device)
    model.load_state_dict(torch.load("model.pth", map_location=device))
    # Switch layers such as dropout/batch-norm to inference behaviour so the
    # prediction is deterministic.
    model.eval()

    # NOTE(review): if VOCAB is a str, characters missing from it map to -1;
    # confirm the model tolerates that index.
    char_ids = [VOCAB.find(c) for c in text[0].upper()]
    # Build the (length, 1) tensor from the id list itself so its size always
    # matches the ids, even when upper-casing changes the string length.
    text_tensor = torch.tensor(char_ids, dtype=torch.long).unsqueeze(1)
    inp = text_tensor.to(device)

    # No gradients are needed for a forward-only pass.
    with torch.no_grad():
        oupt = model(inp)
        prob = torch.nn.functional.softmax(oupt, dim=2)
        prob, pred = torch.max(prob, dim=2)

    color_print(text[0], pred)
    # Named "result" rather than "json" to avoid shadowing the stdlib module.
    result = pred_to_dict(text[0], pred, prob)
    print("\n###########################\n")
    print(result)
    return result
if __name__ == "__main__":
    # Script entry point: clean the raw tweet file, write the cleaned copy,
    # export the embeddings for the observed vocabulary, and build the
    # cross-validation folds.
    parser = get_parser()
    args = parser.parse_args()
    idz = []
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Preprocess Data")
    with codecs.open(args.out_txt, "w", "utf-8") as fod:
        with codecs.open(args.input, "r", "utf-8") as fid:
            msgs = []
            for line_no, line in enumerate(fid, 1):
                # Drop line breaks and quote characters, then the hashtags
                # that would give the label away.
                clean_line = re.sub('[\n\r\'\"]', '', line)
                clean_line = clean_line.replace("#sarcasm", "").replace("#sarcastic", "")
                st = clean_line.split("\t")
                if len(st) != 4:
                    # Fail with a precise message instead of dropping into a
                    # debugger; the unpacking below would raise ValueError
                    # anyway, only without saying which line was malformed.
                    raise ValueError(
                        "%s, line %d: expected 4 tab-separated fields, got %d: %r"
                        % (args.input, line_no, len(st), line))
                tweet_id, user, label, m = st
                idz.append(int(tweet_id))
                m = ut.preprocess(m, sep_emoji=True)
                fod.write(u"%s\t%s\t%s\t%s\n" % (tweet_id, user, label, m))
                msgs.append(m)
    # compute word index
    wrd2idx = ut.word_2_idx(msgs)
    print("Load Word Embeddings")
    emb_utils.save_embeddings_txt(args.word_vectors, args.out_vectors, wrd2idx)
    # pre-compute the crossvalidation folds so that different models
    # can be compared on the same data splits
    build_folds(idz)
# Disabled MeCab tokenizer experiment, kept for reference:
# import MeCab
# tagger = MeCab.Tagger("-Owakati")
# result = tagger.parse("I have a pen. You have a dance.")
# print(result.split())
from my_utils import preprocess

# Disabled hyper-parameters:
# window_size = 1
# hidden_size = 5  # size of the hidden layer (word-vector dimensionality)

# Sample Japanese text fed to preprocess.
japanese_text = "梅雨で雨の日が多いですね。早く梅雨が明けて欲しいですね。"
corpus, word_to_id, id_to_word = preprocess(japanese_text)

# Show the three values returned by preprocess, one per line.
for preprocessed_part in (corpus, word_to_id, id_to_word):
    print(preprocessed_part)

model = SimpleCBOW()
) return parser if __name__ == "__main__": parser = get_parser() args = parser.parse_args() idz = [] print "Preprocess Data" with codecs.open(args.out_txt, "w", "utf-8") as fod: with codecs.open(args.input, "r", "utf-8") as fid: msgs = [] for line in fid: clean_line = re.sub('[\n\r\'\"]', '', line) clean_line = clean_line.replace("#sarcasm", "").replace("#sarcastic", "") st = clean_line.split("\t") if len(st) != 4: set_trace() tweet_id, user, label, m = st idz.append(int(tweet_id)) m = ut.preprocess(m, sep_emoji=True) fod.write(u"%s\t%s\t%s\t%s\n" % (tweet_id, user, label, m)) msgs.append(m) #compute word index wrd2idx = ut.word_2_idx(msgs) print "Load Word Embeddings" emb_utils.save_embeddings_txt(args.word_vectors, args.out_vectors, wrd2idx) # pre-compute the crossvalidation folds so that different models # can be compared on the same data splits build_folds(idz)