Exemplo n.º 1
0
def omd_hcr(in_path):
    """Read labelled messages from an XML file made of ``<item>`` elements.

    Each ``<item>`` whose ``label`` attribute is ``positive``, ``negative``
    or ``neutral`` yields one example: the text of its ``<content>`` child,
    run through ``preprocess``.

    Args:
        in_path: path of the XML file to read.

    Returns:
        A list of ``[label, message]`` pairs, in document order.
    """
    wanted = ('positive', 'negative', 'neutral')
    data = []
    with open(in_path) as fid:
        soup = BeautifulSoup(fid.read(), "xml")
    for item in soup.findAll('item'):
        # .get() so an item without a label attribute is skipped instead of
        # aborting the whole load with a KeyError.
        label = item.attrs.get('label')
        if label not in wanted:
            continue
        msg = item.find("content").text
        # Decode only genuine byte strings. Calling .decode() on text that is
        # already unicode makes Python 2 encode it as ASCII first (which
        # raises on non-ASCII characters) and does not exist on Python 3.
        if isinstance(msg, bytes):
            msg = msg.decode("utf-8")
        data.append([label, preprocess(msg)])
    return data
Exemplo n.º 2
0
def semeval(in_path):
    """Read labelled tweets from a UTF-8, tab-separated file.

    The first field of each line is the label (any double quotes are
    removed) and the second field is the tweet. Only lines labelled
    ``positive``, ``negative`` or ``neutral`` are kept, and their tweet is
    run through ``preprocess``.

    Args:
        in_path: path of the tab-separated file to read.

    Returns:
        A list of ``(label, tweet)`` tuples, in file order.

    Raises:
        IndexError: if a non-blank line has no tab-separated second field.
    """
    wanted = ('positive', 'negative', 'neutral')
    data = []
    with codecs.open(in_path, "r", "utf-8") as fid:
        for line in fid:
            # Strip both LF and CRLF endings so a Windows-style file does not
            # leave a stray "\r" on the tweet.
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines carry no example; skipping them avoids an
                # IndexError on the missing tweet field.
                continue
            fields = line.split("\t")
            label = fields[0].replace("\"", "")
            tweet = fields[1]
            if label in wanted:
                data.append((label, preprocess(tweet)))
    return data
Exemplo n.º 3
0
def stance(in_path, topic):
    """Read the stance-labelled tweets of one topic from a tab-separated file.

    Each non-blank line must have exactly four tab-separated fields; the
    first is ignored and the remaining three are topic, tweet and label.
    Lines whose topic equals ``topic`` and whose label is ``FAVOR``,
    ``AGAINST`` or ``NONE`` are kept.

    Args:
        in_path: path of the tab-separated file to read.
        topic: topic string a line must match exactly to be kept.

    Returns:
        A list of ``[label, tweet]`` pairs where ``label`` is lower-cased and
        ``tweet`` is the output of ``preprocess`` encoded as UTF-8 bytes.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields.
    """
    wanted = ('FAVOR', 'AGAINST', 'NONE')
    data = []
    with open(in_path) as fid:
        for line in fid:
            # Strip LF as well as CRLF: removing only "\r\n" left the newline
            # glued to the label on LF files, so no label ever matched.
            line = line.rstrip("\r\n")
            if not line:
                # A blank line cannot be unpacked into three fields.
                continue
            current_topic, tweet, label = line.split("\t")[1:]
            if current_topic != topic or label not in wanted:
                continue
            # Decode only byte strings so the same code also works when the
            # file object already yields decoded text.
            if isinstance(tweet, bytes):
                tweet = tweet.decode("utf-8")
            tweet = preprocess(tweet)
            data.append([label.lower(), tweet.encode("utf-8")])
    return data
Exemplo n.º 4
0
def casm(in_path):
    """Read de-duplicated labelled tweets from a UTF-8, tab-separated file.

    For each non-blank line, the label is the second comma-separated
    component of the first tab field and the tweet is the second tab field
    run through ``preprocess``. A tweet whose preprocessed form was already
    seen is dropped, so only its first occurrence is kept. The distinct
    labels of the kept examples are printed once reading is complete.

    Args:
        in_path: path of the tab-separated file to read.

    Returns:
        A list of ``[label, tweet]`` pairs, in file order.

    Raises:
        IndexError: if a non-blank line lacks the comma-separated label or
            the tab-separated tweet field.
    """
    seen = set()    # preprocessed tweets already emitted
    labels = set()  # distinct labels of the kept examples
    data = []
    with codecs.open(in_path, "r", "utf-8") as fid:
        for line in fid:
            if not line.strip():
                # Blank lines carry no example; skipping them avoids an
                # IndexError when extracting the label.
                continue
            fields = line.split("\t")
            label = fields[0].split(",")[1]
            tweet = preprocess(fields[1])
            if tweet in seen:
                continue
            seen.add(tweet)
            labels.add(label)
            data.append([label, tweet])
    # Single pre-formatted argument: prints the same text as the former
    # two-operand print statement (note the double space) and is also valid
    # under Python 3.
    print("labels:  %s" % list(labels))
    return data
Exemplo n.º 5
0
if not os.path.exists(output):
    os.makedirs(output)
#### --- TRAIN DATA ---- ####
# Maps a user id (the file name up to its first ".") to the set of that
# user's preprocessed tweets.
tweets_by_user = {}
print("[reading user tweets]")
z = 0  # number of users accepted so far
# No cap by default; set to a small integer (e.g. 100) for a quick debug run.
MAX_USERS = float('inf')
MIN_TWEETS = 100
for fname in os.listdir(path_train):
    if os.path.splitext(fname)[1] != ".gz":
        print("ignored %s" % fname)
        continue
    # os.path.join works whether or not path_train ends with a separator.
    with gzip.open(os.path.join(path_train, fname), 'r') as f:
        user = fname[:fname.index(".")]
        # One JSON object per line; only its 'text' field is used.
        data = [preprocess(json.loads(line)['text']) for line in f]
        # The threshold applies to the raw tweet count, before duplicates
        # are collapsed by set() below.
        if len(data) < MIN_TWEETS:
            print("ignored user %s | %d tweets" % (user, len(data)))
            continue
        tweets_by_user[user] = set(data)
        z += 1
    # Single-line progress display: "\r" rewinds to the start of the line and
    # the trailing spaces blank out leftovers from a longer previous value.
    # NOTE(review): the middle operand was masked in the available source;
    # `user` is restored from the "user: " prefix - confirm.
    sys.stdout.write("\ruser: " + user + " (" + str(z) + ")" + " " * 20)
    sys.stdout.flush()
    if z >= MAX_USERS:
        print("out early!!!!")
        break

print "[writing user tweets]"
user_corpus = codecs.open(output + "mental_health_tweets", "w", "utf-8")
for user, twt in tweets_by_user.items():