Exemplo n.º 1
0
 def preload_maps(self, folder: str = None):
     """Initialise ``self.maps`` with char/word/tag index mappings.

     When *folder* is given, the three maps are restored from their JSON
     files inside it; otherwise fresh maps containing only the reserved
     special tokens (__PAD__ / UNK) are created.
     """
     if folder:
         # Restore previously persisted mappings from disk.
         self.maps = {
             name: sc.load_json(os.path.join(folder, name + ".json"))
             for name in ("char2idx", "word2idx", "tag2idx")
         }
     else:
         # Start from scratch with only the reserved special tokens.
         self.maps = {
             "char2idx": {"__PAD__": 0, "UNK": 1},
             "word2idx": {"__PAD__": 0, "UNK": 1},
             "tag2idx": {"__PAD__": 0},
         }
Exemplo n.º 2
0
def build_advertise_generator(files: List[str]):
    """Yield advertise dicts loaded from a list of JSON / JSON-lines files.

    Files whose name contains "jsonl" are read one JSON object per line;
    any other file is loaded whole via ``sc.load_json`` and assumed to hold
    a list of advertises. A failure on one file is reported through
    ``sc.message`` and processing continues with the remaining files.
    """
    queue = deque(files)
    total_files = len(queue)
    processed = 0
    while queue:
        try:
            file_name = queue.pop()

            if "jsonl" in file_name:
                # JSON-lines: one advertise object per line.
                # `with` guarantees the handle is closed (the original
                # leaked it via `for line in open(...)`).
                with open(file_name, "r", encoding="utf-8") as handle:
                    for line in handle:
                        ad = json.loads(line)
                        print(ad)
                        yield ad
            else:
                # Plain JSON: the file holds a list of advertises.
                for advertise in sc.load_json(file_name):
                    yield advertise

            processed += 1
            sc.message("{} PROCESSED!".format(file_name))
            # Round AFTER scaling to percent so two decimals survive
            # (rounding the ratio first truncated to whole percents).
            sc.message("Worker {}% complete!".format(
                round(processed / total_files * 100, 2)))
        except Exception as err:
            # Best-effort: report the failure and move on to the next file.
            sc.message(err)
    return None
Exemplo n.º 3
0
 def preload_maps(self, folder: str = None):
     """Load and return the parsed-properties schema stored at *folder*.

     The bookkeeping entry ``NUMBER_OF_PROPERTIES`` is stripped before
     returning. Raises ValueError when no path is given.
     """
     # Guard clause: a schema path is mandatory for this variant.
     if not folder:
         raise ValueError(
             "Parsed properties path cannot be None! Run schema pipe to build parsed props!"
         )
     sc.message("Schema loaded !")
     schema = sc.load_json(folder)
     # Drop the bookkeeping entry so only real properties remain;
     # pop with a default tolerates its absence (same as the guarded pop).
     schema.pop("NUMBER_OF_PROPERTIES", None)
     return schema
Exemplo n.º 4
0
    def reduce_process(self):
        """Merge per-worker schema files into the unified aggregates.

        Reads ``general_schema.json`` and ``schema_counter.json`` from every
        sub-folder of ``self.main_folder``, extending the per-key lists in
        ``self.unified_general`` and summing the per-key counts in
        ``self.unified_counter``, then computes the general distribution and
        saves the NER schema.
        """
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        for folder in workers_folder:
            g_schema = sc.load_json(os.path.join(folder,
                                                 "general_schema.json"))
            c_schema = sc.load_json(os.path.join(folder,
                                                 "schema_counter.json"))

            # Accumulate the value lists per key across all workers.
            for k, v in g_schema.items():
                self.unified_general.setdefault(k, []).extend(v)

            # Sum the per-key counters across all workers.
            for k, v in c_schema.items():
                self.unified_counter[k] = self.unified_counter.get(k, 0) + v

        self.get_general_dist()
        self.save_ner_schema()
Exemplo n.º 5
0
    def reduce_process(self):
        """Merge per-worker vocabulary maps into the final index mappings.

        Collects the union of characters, words and tags produced by every
        worker folder under ``self.main_folder``, re-indexes them after the
        reserved special tokens (__PAD__ / UNK) and saves the three maps
        (char2idx / word2idx / tag2idx) under ``<output_folder>/ner_mapping``.
        """
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        if self.debug:
            print("Paths being aggregated...")
            print(workers_folder)

        char2idx = set()
        word2idx = set()
        tag2idx = set()

        sc.message("Processing files...")

        for folderzin in workers_folder:
            # Union the vocabularies seen by each worker; iterating the
            # loaded dicts yields their keys, which is all we need.
            char2idx.update(sc.load_json(os.path.join(folderzin, "char2idx.json")))
            word2idx.update(sc.load_json(os.path.join(folderzin, "word2idx.json")))
            tag2idx.update(sc.load_json(os.path.join(folderzin, "tag2idx.json")))

        reduced_folder = sc.check_folder(
            os.path.join(self.output_folder, "ner_mapping"))

        # Reserved slots first; real entries are appended after them below.
        basec2i = {"__PAD__": 0, "UNK": 1}
        basew2i = {"__PAD__": 0, "UNK": 1}
        baset2i = {"__PAD__": 0}

        # discard() (unlike remove()) tolerates a worker map that lacks a
        # special token, so a partial worker output cannot crash the reduce
        # with KeyError (the original only guarded the tag/UNK case).
        for special in ("__PAD__", "UNK"):
            char2idx.discard(special)
            word2idx.discard(special)
            tag2idx.discard(special)

        # Sort before enumerating so the assigned indices are deterministic
        # across runs — plain set iteration order is not reproducible, which
        # would make the saved mappings differ between identical reduces.
        for i, c in enumerate(sorted(char2idx)):
            basec2i[c] = i + 2

        for i, w in enumerate(sorted(word2idx)):
            basew2i[w] = i + 2

        for i, t in enumerate(sorted(tag2idx)):
            baset2i[t] = i + 1

        sc.message("Saving chars")
        sc.save_dict_2json(os.path.join(reduced_folder, "char2idx.json"),
                           basec2i)
        sc.message("Saving words")
        sc.save_dict_2json(os.path.join(reduced_folder, "word2idx.json"),
                           basew2i)
        sc.message("Saving tags")
        sc.save_dict_2json(os.path.join(reduced_folder, "tag2idx.json"),
                           baset2i)