def format_test(src, target):
    """Tag and clean the tweets of a test file and write them to ``target``.

    Each line of ``src`` is stripped and split on tabs; field 0 is taken as
    the tweet ID and field 1 as the tweet text. All tweets are tagged in a
    single call to ``tweet_tagger.runtagger_parse`` and each result is passed
    through ``clean_tweet`` before being written.

    Output lines have the form ``<tid>\\t0\\t<cleaned tweet>\\n``.

    Args:
        src: Path of the tab-separated input file.
        target: Path of the output file (overwritten if it exists).

    Raises:
        IndexError: If a line of ``src`` has fewer than two tab-separated
            fields (this includes blank lines).
    """
    tids = []
    tweets = []
    # open() replaces the Python 2-only file() builtin, and the with-block
    # guarantees the input handle is closed (the original never closed it).
    with open(src) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            tids.append(fields[0])
            tweets.append(fields[1])

    # presumably returns two sequences parallel to `tweets` (tokens and tags
    # per tweet) -- confirm against tweet_tagger; zip() below stops at the
    # shortest of the three sequences.
    tokens, tags = tweet_tagger.runtagger_parse(tweets)

    # The with-block closes the output file even if clean_tweet() raises.
    with open(target, 'w') as out:
        for tid, token, tag in zip(tids, tokens, tags):
            # assume each test tweet is labeled the non-ADR class (0)
            out.write(tid + '\t0\t' + clean_tweet(token, tag) + '\n')
def format_train_dev(src, target):
    """Tag and clean the tweets of a train/dev file and write them to ``target``.

    Each line of ``src`` is stripped and split on tabs; field 0 is taken as
    the tweet ID, field 2 as the label and field 3 as the tweet text (field 1
    is ignored). All tweets are tagged in a single call to
    ``tweet_tagger.runtagger_parse`` and each result is passed through
    ``clean_tweet`` before being written.

    Output lines have the form ``<tid>\\t<label>\\t<cleaned tweet>\\n``.

    Args:
        src: Path of the tab-separated input file.
        target: Path of the output file (overwritten if it exists).

    Raises:
        IndexError: If a line of ``src`` has fewer than four tab-separated
            fields (this includes blank lines).
    """
    tids = []
    labels = []
    tweets = []
    # open() replaces the Python 2-only file() builtin, and the with-block
    # guarantees the input handle is closed (the original never closed it).
    with open(src) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            tids.append(fields[0])
            labels.append(fields[2])
            tweets.append(fields[3])

    # presumably returns two sequences parallel to `tweets` (tokens and tags
    # per tweet) -- confirm against tweet_tagger; zip() below stops at the
    # shortest of the four sequences.
    tokens, tags = tweet_tagger.runtagger_parse(tweets)

    # The with-block closes the output file even if clean_tweet() raises.
    with open(target, 'w') as out:
        for tid, label, token, tag in zip(tids, labels, tokens, tags):
            out.write(tid + '\t' + label + '\t' + clean_tweet(token, tag) + '\n')