Exemplo n.º 1
0
 def get_instances(self, label_file, xml_file):
     """Pair each <sentence> element of *xml_file* with its score row in
     *label_file* and build tagged Instance objects.

     Each label-file line is "<id> <s0> <s1> ... <s5>"; the argmax of the
     six integer scores selects the emotion name from ``labels_dict``.
     Returns ``(instances, labels_final)`` where ``labels_final`` is the
     set of label indices actually seen.
     """
     instances = []
     labels_final = set()
     # Load the nltk perceptron just once to speed up tagging.
     tagger = PerceptronTagger()
     labels_dict = {
         0: "anger",
         1: "disgust",
         2: "fear",
         3: "joy",
         4: "sadness",
         5: "surprise"
     }
     tree = ET.parse(xml_file)
     root = tree.getroot()
     with open(label_file) as f:
         # Builtin zip replaces Py2-only itertools.izip: pairs each
         # sentence element with its label line in file order.
         for sent, line in zip(root, f):
             # dict views are not indexable on Py3; take the first
             # attribute value via an iterator instead of values()[0].
             id_xml = next(iter(sent.attrib.values()))
             id_labels = line.rstrip().split()
             id_file = id_labels[0]
             if id_xml == id_file:
                 text = None  # keep the LAST text fragment, as before
                 for fragment in sent.itertext():
                     text = fragment
                 labels = id_labels[1:]
                 # Direct argmax over the numeric scores.  The old
                 # labels.index(str(max(int(...)))) lookup broke when a
                 # score string wasn't canonical (e.g. "05"); ties still
                 # resolve to the first maximum.
                 label = max(range(len(labels)), key=lambda i: int(labels[i]))
                 inst = Instance(text, labels_dict[label])
                 inst_tokenized = word_tokenize(text)
                 inst_tagged = tagger.tag(inst_tokenized)
                 for tokentag in inst_tagged:
                     inst.add_token(Token(tokentag[0], tokentag[1]))
                 instances.append(inst)
                 labels_final.add(label)
         return instances, labels_final
Exemplo n.º 2
0
 def get_instances(self, folder):
     """Read "<label> <id> <text>" lines from *folder* and build tagged
     Instance objects, skipping lines labelled "ne" (no emotion).

     Returns ``(instances, labels)`` where ``labels`` is the set of raw
     two-letter emotion codes encountered.
     """
     # NOTE: "hp" is mapped to "joy" (happiness/joy???????????????????????????)
     code_to_emotion = {
         "hp": "joy",
         "sd": "sadness",
         "ag": "anger",
         "dg": "disgust",
         "sp": "surprise",
         "fr": "fear"
     }
     collected = []
     seen_codes = set()
     # Load the nltk perceptron just once to speed up tagging.
     tagger = PerceptronTagger()
     with open(folder) as handle:
         for raw_line in handle:
             # Split by the first two spaces only: code, sentence id, text.
             code, sent_id, sentence = raw_line.strip().split(" ", 2)
             if code == "ne":  # ignore no emotion
                 continue
             inst = Instance(sentence, code_to_emotion[code])
             for word, tag in tagger.tag(word_tokenize(sentence)):
                 inst.add_token(Token(word, tag))
             collected.append(inst)
             seen_codes.add(code)
     return collected, seen_codes
Exemplo n.º 3
0
    def get_instances(self, folder):
        """Collect agreement-annotated sentences from per-author folders.

        For every author directory under *folder*, pairs each file in
        ``agree-sent/`` with its POS file in ``pos/`` and builds one
        Instance per annotated sentence.  Agree-file lines look like
        "<sentence-index>@<label>@<text>"; POS lines are
        "(TAG token):(TAG token):..." sequences.

        Returns ``(instances, labels)`` with ``labels`` the set of
        integer labels seen.
        """
        instances = []
        labels = set()
        for author in os.listdir(folder):
            path = folder + "/" + author + "/agree-sent/"
            path_pos = folder + "/" + author + "/pos/"
            # Guard clauses keep the happy path flat.
            if not (os.path.exists(path) and os.path.exists(path_pos)):
                continue
            for af in os.listdir(path):
                current = os.path.join(path, af)
                current_pos = os.path.join(
                    path_pos,
                    af.split('.')[0] + '.sent.okpuncs.props.pos')
                if not (os.path.isfile(current) and os.path.isfile(current_pos)):
                    continue
                # Text-mode context managers: the old code opened both
                # files in "rb" (bytes on Py3, breaking str.split) and
                # never closed either handle.
                with open(current) as agree_data, open(current_pos) as pos_file:
                    pos_data = pos_file.readlines()
                    for x in agree_data:
                        parts = x.strip().split("@")
                        sent_index = int(parts[0])
                        # POS line at the same index as the sentence id.
                        y = pos_data[sent_index].strip()
                        label = int(parts[1])
                        text = parts[2]
                        inst = Instance(text, label)
                        for tagtoken in y.split("):("):
                            tag = tagtoken.split(" ")[0].lstrip("(")
                            word = tagtoken.split(" ")[1]
                            inst.add_token(Token(word, tag))
                        instances.append(inst)
                        labels.add(label)

        return instances, labels
Exemplo n.º 4
0
 def get_instances(self, folder):
     """Read tab-separated "<id>\\t<text>\\t<label>" rows from *folder*
     (UTF-8) and build tagged Instance objects.

     Malformed rows (not exactly three fields) are skipped; '#' marks
     are stripped from the text and labels are reduced to lowercase
     letters.  Returns ``(instances, labels)``.
     """
     collected = []
     seen_labels = set()
     # Load the nltk perceptron just once to speed up tagging.
     tagger = PerceptronTagger()
     with io.open(folder, encoding="utf-8") as handle:
         for raw_line in handle:
             fields = raw_line.rstrip().split("\t")
             if len(fields) != 3:
                 # Skip rows that don't have id, text and label.
                 continue
             row_id, text, label = fields
             row_id = row_id.rstrip(":")
             text = re.sub('[#]', '', text.rstrip())
             label = re.sub('[^a-z]', '', label)
             inst = Instance(text, label)
             for word, tag in tagger.tag(word_tokenize(text)):
                 inst.add_token(Token(word, tag))
             collected.append(inst)
             seen_labels.add(label)
     return collected, seen_labels