Example #1
 def add_pinlei_tag_yyh(self):
     LogInfo.begin_track("Begin adding tags for pinleis...")
     fin = codecs.open(self.root_fp + "/yyh_w2v_train.txt",
                       'r',
                       encoding='utf-8')
     fout = codecs.open(self.root_fp + "/yyh_w2v_train.txt.pinlei_tag",
                        'w',
                        encoding='utf-8')
     cnt = 0
     for line in fin:
         spt = line.strip().split()
         new_line = ""
         i = 0
         while i < len(spt):
             if i + 3 < len(spt):
                 str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                 if str4 in self.pinlei_set:
                     LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                  spt[i], spt[i + 1], spt[i + 2],
                                  spt[i + 3])
                     new_line += "[[" + str4 + "]] "
                     i += 4
                     continue
             if i + 2 < len(spt):
                 str3 = spt[i] + spt[i + 1] + spt[i + 2]
                 if str3 in self.pinlei_set:
                     # LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                     #              spt[i], spt[i+1], spt[i+2])
                     new_line += "[[" + str3 + "]] "
                     i += 3
                     continue
             if i + 1 < len(spt):
                 str2 = spt[i] + spt[i + 1]
                 if str2 in self.pinlei_set:
                     # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                     #              spt[i], spt[i+1])
                     new_line += "[[" + str2 + "]] "
                     i += 2
                     continue
             if spt[i] in self.pinlei_set:
                 # LogInfo.logs("Found pinlei [%s]", spt[i])
                 new_line += "[[" + spt[i] + "]] "
                 i += 1
                 continue
             new_line += spt[i] + " "
             i += 1
         fout.write(new_line + "\n")
         cnt += 1
         if cnt < 5:
             LogInfo.logs("res ==> (%s)", new_line)
         LogInfo.show_line(cnt, 100000)
     fin.close()
     fout.close()
     LogInfo.end_track("Pinlei tags added.")
Example #2
 def load(self, data_file, encoding):
     LogInfo.begin_track("Loading data from %s...", data_file)
     if os.path.isfile(data_file):
         LogInfo.begin_track("[Exist] Loading from %s...", data_file)
         query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
             = list(), list(), list(), list(), list(), list()
         cnt = 0
         with codecs.open(data_file, 'r', encoding=encoding) as fin:
             for line in fin:
                 spt = line.strip().split("\t")
                 query_idxs.append([int(idx) for idx in spt[0].split(" ")])
                 query_lens.append(int(spt[1]))
                 labels.append([int(idx) for idx in spt[2].split(" ")])
                 intents.append(int(spt[3]))
                 link_masks.append([int(idx) for idx in spt[4].split(" ")])
                 entity_idxs.append([int(idx) for idx in spt[5].split(" ")])
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     else:
         txt_data_file = data_file + ".name"
         LogInfo.begin_track("[Not Exist] Loading from %s...",
                             txt_data_file)
         query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
             = list(), list(), list(), list(), list(), list()
         cnt = 0
         fout = codecs.open(data_file, 'w', encoding=encoding)
         with codecs.open(txt_data_file, 'r', encoding=encoding) as fin:
             for line in fin:
                 query_idx, query_len, label, intent, link_mask, entity_idx\
                     = self.decode_line(line)
                 fout.write(" ".join([str(x) for x in query_idx]) + "\t" +
                            str(query_len) + "\t" +
                            " ".join([str(x) for x in label]) + "\t" +
                            str(intent) + "\t" +
                            " ".join([str(x) for x in link_mask]) + "\t" +
                            " ".join([str(x) for x in entity_idx]) + "\n")
                 query_idxs.append(query_idx)
                 query_lens.append(query_len)
                 labels.append(label)
                 intents.append(intent)
                 link_masks.append(link_mask)
                 entity_idxs.append(entity_idx)
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
         fout.close()
         LogInfo.logs("Write into %s.", data_file)
         LogInfo.end_track("Max_seq_len = %d.", self.max)
     self.data = list(
         zip(query_idxs, query_lens, labels, intents, link_masks,
             entity_idxs))
     self.data_size = len(self.data)
     LogInfo.end_track("Loaded. Size: %d.", self.data_size)
Example #3
    def load_vocab_embedding(self, embedding_file, encoding):
        LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
        vocab_embedding = [None] * len(self.vocab_index_dict)
        with codecs.open(embedding_file, 'r', encoding=encoding) as fin:
            count = 0
            for line in fin:
                strs = line.split()
                embedding = [float(strs[i].strip()) for i in range(1, len(strs))]
                vocab_embedding[self.vocab_index_dict[strs[0].strip()]] = embedding
                count += 1
                LogInfo.show_line(count, 50000)

        # every vocab token must have received exactly one embedding row
        assert count == len(vocab_embedding)
        self.vocab_embedding = np.asarray(vocab_embedding)
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Example #4
    def load_vocab_name(self, vocab_file, encoding):
        LogInfo.begin_track("Loading vocab from %s...", vocab_file)
        self.vocab_size = 0
        self.index_vocab_dict.clear()
        self.vocab_index_dict.clear()
        with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
            index = 0
            for line in fin:
                self.vocab_index_dict[line.strip()] = index
                self.index_vocab_dict.append(line.strip())
                index += 1
                LogInfo.show_line(index, 50000)

        self.vocab_size = index
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Example #5
 def load(self, data_file, encoding):
     LogInfo.begin_track("Loading data from %s...", data_file)
     context_idxs, context_seqs, pinlei_idxs = list(), list(), list()
     cnt = 0
     with codecs.open(data_file, 'r', encoding=encoding) as fin:
         for line in fin:
             context_idx, context_seq, pinlei_idx = self.decode_line(line)
             context_idxs.append(context_idx)
             context_seqs.append(context_seq)
             pinlei_idxs.append(pinlei_idx)
             cnt += 1
             LogInfo.show_line(cnt, 10000)
     self.data = list(zip(context_idxs, context_seqs, pinlei_idxs))
     self.data_size = len(self.data)
     LogInfo.end_track()
Example #6
 def load_vocab(self, vocab_file, embedding_dim, encoding):
     LogInfo.begin_track("Loading vocab from %s...", vocab_file)
     self.vocab_size = 0
     self.index_vocab_dict.clear()
     self.vocab_index_dict.clear()
     self.vocab_embedding.clear()
     with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
         index = 0
         # 0 embedding for not-found query term
         self.vocab_index_dict["[[NULL]]"] = index
         self.index_vocab_dict.append("[[NULL]]")
         self.vocab_embedding.append([0.0 for _ in range(embedding_dim)])
         index += 1
         for line in fin:
             spt = line.strip().split()
             self.vocab_index_dict[spt[0]] = index
             self.index_vocab_dict.append(spt[0])
             embedding = [float(spt[i].strip()) for i in range(1, len(spt))]
             self.vocab_embedding.append(embedding)
             index += 1
             LogInfo.show_line(index, 50000)
     self.vocab_size = len(self.vocab_embedding)
     self.vocab_embedding = np.array(self.vocab_embedding)
     LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Example #7
    def load_data(self):
        """
        load data from files
        :return: 
        """
        LogInfo.begin_track("Loading data...")
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        with open("/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt") as finc, \
                open("/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt") as fine:
            cnt = 0
            for linec, linee in zip(finc, fine):
                cnt += 1
                LogInfo.show_line(cnt, 100000)
                sptc = linec.strip().split()
                spte = linee.strip().split()
                wordc = sptc[0]
                worde = spte[0]
                try:
                    # lists, not map(): map() is a one-shot iterator in Python 3
                    vecc = [float(x) for x in sptc[1:]]
                    vece = [float(x) for x in spte[1:]]
                    self.sync[wordc] = vecc
                    self.syne_neg[worde] = vece
                except ValueError:
                    LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                    continue
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        with open("/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt") as finc, \
                open("/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt") as fine:
            # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
            #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
            cnt = 0
            for linec, linee in zip(finc, fine):
                cnt += 1
                LogInfo.show_line(cnt, 100000)
                sptc = linec.strip().split()
                spte = linee.strip().split()
                wordc = sptc[0]
                worde = spte[0]
                try:
                    vecc = [float(x) for x in sptc[1:]]
                    vece = [float(x) for x in spte[1:]]
                    self.sync_neg[wordc] = vecc
                    self.syne[worde] = vece
                except ValueError:
                    LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                    continue
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # NN, JJ, VB
        with open("/home/yuchen/data/copa_phr.txt") as fin:
            for i in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                sentence = [x.split(':')[1]
                            for x in raw_sentence.strip().split()]
                option1 = [x.split(':')[1]
                           for x in raw_option1.strip().split()]
                option2 = [x.split(':')[1]
                           for x in raw_option2.strip().split()]
                self.copa_data.append([sentence, option1, option2])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
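
Each vector file holds one "word f1 f2 ... fn" line per entry, and the try/except skips header or corrupted rows. A condensed sketch of the same parsing (load_vectors is mine, not the source's):

def load_vectors(path):
    # Parse "word f1 f2 ..." lines into {word: [floats]}; skip bad rows.
    vecs = {}
    with open(path) as fin:
        for line in fin:
            spt = line.strip().split()
            if not spt:
                continue
            try:
                vecs[spt[0]] = [float(x) for x in spt[1:]]
            except ValueError:
                continue
    return vecs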
Example #8
    def load_data(self):
        """
        load data from files
        :return: 
        """
        LogInfo.begin_track("Loading data...")
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        with open("/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt") as finc, \
                open("/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt") as fine:
            cnt = 0
            for linec, linee in zip(finc, fine):
                cnt += 1
                LogInfo.show_line(cnt, 100000)
                sptc = linec.strip().split()
                spte = linee.strip().split()
                wordc = sptc[0]
                worde = spte[0]
                vecc = [float(x) for x in sptc[1:]]
                vece = [float(x) for x in spte[1:]]
                self.sync[wordc] = vecc
                self.syne_neg[worde] = vece
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        with open("/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt") as finc, \
                open("/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt") as fine:
            # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
            #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
            cnt = 0
            for linec, linee in zip(finc, fine):
                cnt += 1
                LogInfo.show_line(cnt, 100000)
                sptc = linec.strip().split()
                spte = linee.strip().split()
                wordc = sptc[0]
                worde = spte[0]
                vecc = [float(x) for x in sptc[1:]]
                vece = [float(x) for x in spte[1:]]
                self.sync_neg[wordc] = vecc
                self.syne[worde] = vece
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # NN, JJ, VB
        with open("/home/yuchen/data/copa_lem.txt") as fin:
            for i in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                sentence = list()
                option1 = list()
                option2 = list()
                for word in raw_sentence.strip().split():
                    if word.startswith(('NN', 'JJ', 'VB')):
                        sentence.append(word.split(':')[1])
                for word in raw_option1.strip().split():
                    if word.startswith(('NN', 'JJ', 'VB')):
                        option1.append(word.split(':')[1])
                for word in raw_option2.strip().split():
                    if word.startswith(('NN', 'JJ', 'VB')):
                        option2.append(word.split(':')[1])

                self.copa_data.append([sentence, option1, option2])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
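
The token files appear to use a "TAG:word" format, and only nouns, adjectives, and verbs are kept. A compact equivalent of the three filter loops above (content_words is illustrative):

def content_words(line, tags=('NN', 'JJ', 'VB')):
    # Keep tokens whose POS tag starts with NN/JJ/VB; drop the tag prefix.
    return [tok.split(':')[1] for tok in line.strip().split()
            if tok.startswith(tags)]

# content_words("NN:dog VBD:bark IN:at NN:mailman")
#   -> ['dog', 'bark', 'mailman']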
Example #9
    def process_query(self):
        LogInfo.begin_track("Begin adding tags for queries...")
        fin = codecs.open(self.root_fp + "/query.txt", 'r', encoding='utf-8')
        fout = codecs.open(self.root_fp + "/query_label.txt",
                           'w',
                           encoding='utf-8')
        cnt = 0
        for line in fin:
            spt = line.strip().split()
            new_line = ""
            context = ""
            label = set()
            i = 0
            while i < len(spt):
                if i + 4 < len(spt):
                    str5 = (spt[i] + spt[i + 1] + spt[i + 2] +
                            spt[i + 3] + spt[i + 4])
                    if str5 in self.pinlei_set:
                        LogInfo.logs("Found 5-term pinlei [%s|%s|%s|%s|%s]",
                                     spt[i], spt[i + 1], spt[i + 2],
                                     spt[i + 3], spt[i + 4])
                        label.add(str5)
                        new_line += "[[" + str5 + "]] "
                        i += 5
                        continue
                if i + 3 < len(spt):
                    str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                    if str4 in self.pinlei_set:
                        LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                     spt[i], spt[i + 1], spt[i + 2],
                                     spt[i + 3])
                        label.add(str4)
                        new_line += "[[" + str4 + "]] "
                        i += 4
                        continue
                if i + 2 < len(spt):
                    str3 = spt[i] + spt[i + 1] + spt[i + 2]
                    if str3 in self.pinlei_set:
                        LogInfo.logs("Found 3-term pinlei [%s|%s|%s]", spt[i],
                                     spt[i + 1], spt[i + 2])
                        label.add(str3)
                        new_line += "[[" + str3 + "]] "
                        i += 3
                        continue
                if i + 1 < len(spt):
                    str2 = spt[i] + spt[i + 1]
                    if str2 in self.pinlei_set:
                        # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                        #              spt[i], spt[i+1])
                        label.add(str2)
                        new_line += "[[" + str2 + "]] "
                        i += 2
                        continue
                if spt[i] in self.pinlei_set:
                    # LogInfo.logs("Found pinlei [%s]", spt[i])
                    label.add(spt[i])
                    new_line += "[[" + spt[i] + "]] "
                    i += 1
                    continue
                context += spt[i] + " "
                new_line += spt[i] + " "
                i += 1

            if len(label) != 0:
                ret = new_line.strip() + "\t" + \
                      context.strip() + "\t" + \
                      "\t".join(label) + "\n"
            else:
                ret = new_line.strip() + "\n"
            fout.write(ret)
            cnt += 1
            if cnt < 5:
                LogInfo.logs("res ==> (%s)", ret.strip())
            LogInfo.show_line(cnt, 100000)
        fin.close()
        fout.close()
        LogInfo.end_track("Query processed.")
Example #10
    fb_path = "/home/kangqi/Freebase/Transform"

    LogInfo.begin_track("Loading wiki-fb entity map...")
    wiki_fb_map = dict()
    cnt = 0
    with open(fb_path + "/GS-cleanWiki-triple.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            if len(spt) < 3:
                continue
            fb_ent = spt[0]
            wiki_ent = spt[2].split('/wiki/')[1][:-1]
            wiki_ent = wiki_ent.lower().replace('_', ' ')
            wiki_fb_map[wiki_ent] = fb_ent
            cnt += 1
            LogInfo.show_line(cnt, 500000)
    LogInfo.end_track("%d pairs in total", cnt)

    LogInfo.begin_track("Loading fb entity pop...")
    fb_ent_pop_map = dict()
    cnt = 0
    with open("/home/xusheng/freebase/top5m.mid") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            if len(spt) < 2:
                continue
            ent = spt[0]
            pop = int(spt[1])
            fb_ent_pop_map[ent] = pop
            cnt += 1
            LogInfo.show_line(cnt, 500000)