Пример #1
0
    def load_column(
        self,
        filename: str,
        text_ind: int,
        class_ind: int,
        separator=" ",
        maxlen=1000,
    ) -> None:
        """Load the text view from a column-formatted file.

        Every line is stripped and split on ``separator``. A line is accepted
        only when it has exactly ``class_ind + 1`` columns, i.e. when the
        class column is the last one; all other lines are silently ignored.
        For each accepted line a new graph node is appended to ``self.texts``
        with the keys ``"text"`` and ``"target_class"`` (lower-cased).

        Args:
            filename: path of the file to read.
            text_ind: column index of the text.
            class_ind: column index of the text class.
            separator: column separator handed to ``split``.
            maxlen: currently unused; kept for interface compatibility.
        """
        # The context manager guarantees the file is closed even if a line
        # fails to parse.
        with open(filename, "r") as f:
            for line in f:
                spl = split(line.strip(), separator)
                if len(spl) != class_ind + 1:
                    continue
                # Create the node only for accepted lines so that no empty,
                # unused node is left behind in the graph.
                cur_text = graph.Node(self.graph, {})
                cur_text["text"] = spl[text_ind]
                cur_text["target_class"] = spl[class_ind].lower()
                self.texts.append(cur_text)
Пример #2
0
 def load_bd(self, filename: str) -> None:
     """
     Load the sentence view from a big_dict structure file.

     The file is parsed as JSON. Each top-level item is read at
     ``item['constant']['text']``, which is iterated as a sequence of
     sentences, each of them a sequence of word records. For every word:
     "class" and "tone" take the first element of
     ``record["variable"][name]``, "pos_start"/"pos_end" take the string
     form of ``record["pos"][0]`` / ``record["pos"][1]``, and the remaining
     columns are copied from the record by name. One "sentence" node per
     sentence is appended to ``self.sentences``.

     Args:
         filename: file name <str>
     """
     # Only parsing needs the file; release the handle before building nodes.
     with open(filename) as f:
         data = json.load(f)
     column_names = [
         "word", "pos_start", "pos_end", "type", "tag", "class", "tone"
     ]
     self.columns = column_names
     for item in data:
         for sentence in item['constant']['text']:
             cur_sentence = graph.Node(self.graph, {})
             cur_sentence["type"] = "sentence"
             cur_sentence["words"] = graph.NodeList(self.graph)
             for record in sentence:
                 word = graph.Node(self.graph, {})
                 for c_name in column_names:
                     if c_name in ("class", "tone"):
                         word[c_name] = record["variable"][c_name][0]
                     elif c_name == "pos_start":
                         # Each position is set once, under its own column
                         # name, instead of both being written twice.
                         word[c_name] = str(record["pos"][0])
                     elif c_name == "pos_end":
                         word[c_name] = str(record["pos"][1])
                     else:
                         word[c_name] = record[c_name]
                 cur_sentence["words"].append(word)
             self.sentences.append(cur_sentence)
Пример #3
0
    def get_text(self, text):
        """Build the sentence view from a raw text.

        ``text`` is handed to ``load_words_save_separators``; its result,
        together with ``text``, goes to ``get_sentences``. Every group that
        call yields becomes a "sentence" node whose "words" list holds one
        "word" node per entry, carrying the entry's "word", "pos_start" and
        "pos_end" values. Finished sentence nodes are appended to
        ``self.sentences`` and ``self.columns`` is set to the word fields.
        """
        sentence_groups = get_sentences(load_words_save_separators(text), text)

        self.columns = ["type", "pos_start", "pos_end", "word"]
        copied_fields = ("word", "pos_start", "pos_end")
        for group in sentence_groups:
            sentence_node = graph.Node(self.graph, {})
            sentence_node["words"] = graph.NodeList(self.graph)
            sentence_node["type"] = "sentence"
            for entry in group:
                word_node = graph.Node(self.graph, {})
                word_node["type"] = "word"
                for field in copied_fields:
                    word_node[field] = entry[field]
                sentence_node["words"].append(word_node)
            self.sentences.append(sentence_node)
Пример #4
0
    def load_column(self,
                    filename: str,
                    separator=" ",
                    maxlen=1000,
                    default_class=None,
                    expand=False,
                    maxwords=None) -> None:
        """Load the sentence view from a column-formatted file.

        The first line names the columns (stored in ``self.columns``). Each
        following line is either a sentence boundary (empty, or containing
        ``<STOP>``) or one word whose values are split on ``separator`` and
        stored under the column names. Completed sentences are appended to
        ``self.sentences``.

        Args:
            filename: path of the file to read.
            separator: column separator handed to ``split``.
            maxlen: a sentence holding more than this many words is closed
                before the next word is added.
            default_class: value for columns missing from a short row; when
                it is None, short rows are reported and skipped.
            expand: when true, every column value is also attached to the
                word as a child node ``{"type": column, "value": value}``;
                surplus values of a long row are attached under the last
                column name. Stored in ``self.extended``.
            maxwords: stop reading after this many data lines (None reads
                the whole file).
        """

        def new_sentence():
            # Every sentence node, including the very first, is typed.
            node = graph.Node(self.graph, {})
            node["words"] = graph.NodeList(self.graph)
            node["type"] = "sentence"
            return node

        def attach(word, name, value):
            # In expanded mode each value is mirrored as a child node.
            if expand:
                word.Connect(
                    graph.Node(self.graph, {"type": name, "value": value}))

        self.extended = expand
        # The context manager closes the file on every exit path, including
        # the early return taken when ``maxwords`` is reached.
        with open(filename, "r") as f:
            # A byte-order mark, if any, sits at the start of the file, i.e.
            # in the header line, so it has to be removed here as well.
            header = f.readline().strip().replace("\ufeff", "")
            column_names = split(header, separator)
            self.columns = column_names
            cur_sentence = new_sentence()

            for line_num, raw_line in enumerate(f, start=1):
                if maxwords is not None and line_num > maxwords:
                    # NOTE(review): the sentence being built is discarded
                    # here, as before; confirm that this is intended.
                    return
                line = raw_line.strip().replace("\ufeff", "")

                is_boundary = "<STOP>" in line or line == ""
                if is_boundary or len(cur_sentence["words"]) > maxlen:
                    if len(cur_sentence["words"]) > 0:
                        self.sentences.append(cur_sentence)
                        cur_sentence = new_sentence()
                    if is_boundary:
                        continue
                    # Otherwise the split was forced by ``maxlen``: the
                    # current line is a regular word and must not be lost.

                words = split(line, separator)
                if len(words) < len(column_names) and default_class is None:
                    print(
                        "Error: on col " + str(line_num) +
                        " not enough data and no default class provided, skipping"
                    )
                    # Really skip the row instead of appending an empty word.
                    continue

                word = graph.Node(self.graph, {})
                # Columns for which the row provides a value.
                for name, value in zip(column_names, words):
                    word[name] = value
                    attach(word, name, value)
                # Short row: remaining columns receive the default value.
                for name in column_names[len(words):]:
                    word[name] = default_class
                    attach(word, name, default_class)
                # Long row: surplus values exist only as child nodes typed
                # with the last column name (expanded mode only).
                for value in words[len(column_names):]:
                    attach(word, column_names[-1], value)

                word["type"] = "word"
                cur_sentence["words"].append(word)

        # Flush the trailing sentence when the file does not end on a
        # boundary line.
        if len(cur_sentence["words"]) > 0:
            self.sentences.append(cur_sentence)
Пример #5
0
 def load_sentences(self, sentences):
     """Adopt ``sentences`` as the sentence view and build word nodes.

     ``self.sentences`` is replaced by the given object. Then a graph node
     is constructed from every entry of each sentence's "words" collection;
     the constructed nodes are not stored by this method.
     """
     self.sentences = sentences
     every_word = (entry for sent in sentences for entry in sent["words"])
     for entry in every_word:
         graph.Node(self.graph, entry)