Example #1
def main():
    txt_path = './data.txt'
    txt_list = load_txt(txt_path)
    data_path = './dataset.txt'
    dataset_txt = load_dataset(data_path)
    fin_result_list = []
    # process each segmented sentence in turn
    print('{} sentences to process.'.format(len(txt_list)))
    for idx, txt_line in enumerate(txt_list, 1):
        result_list = []
        seg_list = seg_words(txt_line)
        print('Sentence index : {}.'.format(idx))
        for seg_word in tqdm.tqdm(list(set(seg_list))):
            if len(seg_word) <= 1:
                continue
            result = judge_txt(seg_word, dataset_txt)
            result_list.append((seg_word, result))
            # print('{} : {}'.format(seg_word, result))
        fin_result_list.extend(result_list)
    write_txt(fin_result_list, './result.txt')
    result_list = load_txt('./result.txt')
    result_list = [
        result.replace('(', '').replace(')', '').replace(' ', '').split(',')
        for result in result_list
    ]
    top_list = pridict(result_list, 10)
    print(top_list)
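The helpers used throughout these examples (load_txt, write_txt, and friends) come from each project's own utils module and are not shown on this page. As a rough, assumed sketch of the behaviour most examples rely on (a list of stripped lines in, one item per line out), they could look like the following; Example #8 clearly uses a different variant that returns a tensor.

# Assumed minimal helpers, not any project's actual code.
def load_txt(path, skip=0):
    # Read a UTF-8 text file and return its lines without trailing newlines,
    # optionally skipping the first `skip` lines.
    with open(path, encoding='utf-8') as fh:
        return [line.rstrip('\n') for line in fh][skip:]


def write_txt(items, path):
    # Write one item per line.
    with open(path, 'w', encoding='utf-8') as fh:
        for item in items:
            fh.write('{}\n'.format(item))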
def get_train_test_data(train_label_fea_file, test_label_fea_file):
    train_label_fea = load_txt(train_label_fea_file)[1:]
    test_label_fea = load_txt(test_label_fea_file)[1:]
    all_labels = []
    # train
    train_doc_wordID = {}
    train_label = {}
    train_feature_str = {}
    for i in xrange(len(train_label_fea)):
        line = train_label_fea[i]
        labels_str, feature_str = line.split(' ', 1)
        # label
        labels_str = labels_str.split(',')
        try:
            labels = [int(label) for label in labels_str]
        except ValueError:
            continue
        all_labels.append(labels)
        train_label[i] = labels
        # feature
        train_feature_str[i] = feature_str
        # word
        word_tfidf = {}
        for pair in feature_str.split(' '):
            word, tfidf = pair.split(':')
            word_tfidf[int(word)] = float(tfidf)
        train_doc_wordID[i] = word_tfidf
    # test
    test_doc_wordID = {}
    test_label = {}
    test_feature_str = {}
    for i in xrange(len(test_label_fea)):
        line = test_label_fea[i]
        labels_str, feature_str = line.split(' ', 1)
        # label
        labels_str = labels_str.split(',')
        try:
            labels = [int(label) for label in labels_str]
        except ValueError:
            continue
        test_label[i] = labels
        # feature
        test_feature_str[i] = feature_str
        # word
        word_tfidf = {}
        for pair in feature_str.split(' '):
            word, tfidf = pair.split(':')
            word_tfidf[int(word)] = float(tfidf)
        test_doc_wordID[i] = word_tfidf
    all_labels = np.unique(np.concatenate(all_labels)).tolist()
    dump_pickle(train_feature_str, data_source_path + 'train_feature.pkl')
    dump_pickle(test_feature_str, data_source_path + 'test_feature.pkl')
    dump_pickle(train_doc_wordID, data_source_path + 'train_doc_wordID.pkl')
    dump_pickle(train_label, data_source_path + 'train_label.pkl')
    dump_pickle(test_doc_wordID, data_source_path + 'test_doc_wordID.pkl')
    dump_pickle(test_label, data_source_path + 'test_label.pkl')
    return all_labels, train_doc_wordID, train_label, test_doc_wordID, test_label, train_feature_str, test_feature_str
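The label/feature files parsed above appear to follow the usual extreme-classification text layout: a comma-separated list of label ids, one space, then space-separated wordID:tfidf pairs. A tiny standalone illustration of that parsing, with a made-up sample line, behaves as follows:

# Hypothetical sample line in the layout get_train_test_data() expects.
sample = '3,17 42:0.31 7:0.12 1053:0.08'
labels_str, feature_str = sample.split(' ', 1)
labels = [int(label) for label in labels_str.split(',')]   # [3, 17]
word_tfidf = {}
for pair in feature_str.split(' '):
    word, tfidf = pair.split(':')
    word_tfidf[int(word)] = float(tfidf)
print(labels, word_tfidf)   # [3, 17] {42: 0.31, 7: 0.12, 1053: 0.08}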
Example #3
def create_labels_words(path=path_dataset + "semcor+omsti.gold.key.txt"):
    ###############################################################################
    # This function, given the gold file, creates a dictionary of labels (babelnet id, Wndomain, Lexname)
    # for each ambiguous word
    #
    # Input:
    #   path: path of the gold file
    #
    # Output:
    #   dict_sensekey: dictionary of labels
    ###############################################################################

    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path)]

    dict_sensekey = {}

    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the wordnet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]

        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]

        # add the factotum label to all the words that don't have a Wndomain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    return dict_sensekey
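The wn_id built above is simply WordNet's zero-padded synset offset followed by its POS tag. A quick standalone check of that construction (assuming NLTK and its WordNet corpus are installed; the exact offset depends on the WordNet version) looks like this:

from nltk.corpus import wordnet as wn

# 'dog%1:05:00::' is the sense key used in NLTK's own lemma_from_key() examples.
synset = wn.lemma_from_key('dog%1:05:00::').synset()
wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
print(wn_id)  # 'wn:02084071n' with WordNet 3.0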
Example #4
 def load(cls, filename):
     buffer = utils.load_txt(filename)
     p1 = buffer[0]
     m = int(buffer[1])
     n = int(buffer[2])
     alpha = buffer[3:m+3]
     a = buffer[3+m:3+m+n]
     b = buffer[3+m+n:3+m+n+n]
     return cls(p1, alpha, a, b)
def create_labels_words(path_input, save_gt=False):
    ###############################################################################
    # This function, given the gold file, creates a dictionary of labels for each
    # ambiguous word (babelnet id, Wndomain, Lexname) and, if save_gt is True,
    # this function saves the ground truth
    #
    # Input:
    #   path_input: path of the gold file
    #   save_gt: if True, the ground truth is saved, otherwise no
    #
    # Output:
    #   dict_sensekey: dictionary of labels
    ###############################################################################

    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path_input)]
    dict_sensekey = {}

    for list_info in sense_keys:

        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the wordnet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]

        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]

        # add the factotum label to all the words that don't have a Wndomain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    # save the ground truth
    if save_gt:

        name_words = list(dict_sensekey.keys())
        gt_words = list(dict_sensekey.values())
        combined = list(zip(name_words, gt_words))

        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][0], combined)),
            "../resources/Test/fine_grained_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][1], combined)),
            "../resources/Test/wndomain_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][2], combined)),
            "../resources/Test/lexname_gt.txt")

        return None

    return dict_sensekey
Example #6
def load_dataset(dataset_path):
    # load the simple, hand-written corpus
    dataset_txt = []
    dataset_list = load_txt(dataset_path)
    for line in dataset_list:
        temp_list = line.split('|')
        for i in temp_list:
            if len(i) >= 2:
                dataset_txt.append(i)
    return dataset_txt
Example #7
 def __init__(self, model_path, device, pretrained_model_path, model_name, sent_embd_path, word_embd_path='', stopword_path='./lib/hit_stopwords.txt', num_classes=2):
     # super().__init__()
     self.device = device
     self.model = load_model(model_path, device, pretrained_model_path, model_name, num_classes)
     print('Load classification model {}'.format(model_path))
     self.stopword_lst = load_txt(stopword_path)
     # self.w2i = pickle.load(open(os.path.join(embd_path, 'w2i.pkl'),'rb'))
     # self.i2w = pickle.load(open(os.path.join(embd_path, 'i2w.pkl'),'rb'))
     # self.embd_matrix = np.load(os.path.join(embd_path, 'i2w.pkl'))
     self.sent_embd = hub.load(sent_embd_path)
     # spacy.prefer_gpu()
     if word_embd_path:
         self.nlp = spacy.load(word_embd_path)
     else:
         self.nlp = spacy.load('zh_core_web_lg')
     print('Textfooler init success!')
Example #8
def load(save_path, max_size, skip=0):
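    # Note: in this project load_txt evidently returns a torch.Tensor (its result is
    # reshaped with .view() below), unlike most other examples on this page, where it
    # returns a list of text lines.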
    code = load_txt(save_path + 'code.txt', skip)
    print('loading code finished')
    parent_matrix = load_txt(save_path + 'parent_matrix.txt', skip).view(-1, max_size, max_size)
    print('loading parent_matrix finished')
    brother_matrix = load_txt(save_path + 'brother_matrix.txt', skip).view(-1, max_size, max_size)
    print('loading brother_matrix finished')
    rel_par_ids = load_txt(save_path + 'relative_parents.txt', skip).view(-1, max_size, max_size)
    print('loading rel_par_ids finished')
    rel_bro_ids = load_txt(save_path + 'relative_brothers.txt', skip).view(-1, max_size, max_size)
    print('loading rel_bro_ids finished')
    comments = load_txt(save_path + 'comments.txt', skip)
    print('loading comments finished')

    return code, parent_matrix, brother_matrix, rel_par_ids, rel_bro_ids, comments
Example #9
def prepare_ans_conditional_data(
    data_file,
    out_dir,
    out_prefix,
    n_ans_per_txt=10,
    use_no_ans=False,
    use_only_no_ans=False,
):
    """ Given a text file, extract possible answer candidates for each line.

    Will generate n_ans_per_txt instances for each line in the txt file
    """

    txt_w_ans_file = f"{out_dir}/{out_prefix}_w_{n_ans_per_txt}ans.txt"
    txt_file = f"{out_dir}/{out_prefix}.txt"
    ans_file = f"{out_dir}/{out_prefix}_{n_ans_per_txt}ans.txt"

    print(
        f"Preparing answer conditional question generation data for {data_file}"
    )
    if use_only_no_ans:
        print("\twith ONLY NO_ANS!")
    elif use_no_ans:
        print("\twith NO_ANS option!")
    else:
        print("\twithout NO_ANS option!")

    all_txts = load_txt(data_file)
    print("Extracting entities...")
    all_anss = extract_ans(all_txts)
    print("\tDone!")
    print(f"\tMin ans count: {min(len(a) for a in all_anss)}")
    print(f"\tMax ans count: {max(len(a) for a in all_anss)}")

    print("Writing...")
    txts_w_ans = list()
    all_txt = list()
    all_ans = list()
    for txt, anss in zip(all_txts, all_anss):
        if use_only_no_ans:
            anss = [NO_ANS_TOK] * n_ans_per_txt
        elif use_no_ans:
            if len(anss) > n_ans_per_txt - 1:
                anss = random.sample(anss, k=n_ans_per_txt - 1)
            anss += [NO_ANS_TOK] * (n_ans_per_txt - len(anss))
            assert NO_ANS_TOK in anss, ipdb.set_trace()
        else:
            if len(anss) < n_ans_per_txt:
                extra_anss = random.choices(anss, k=n_ans_per_txt - len(anss))
                anss += extra_anss
            if len(anss) > n_ans_per_txt:
                anss = random.sample(anss, n_ans_per_txt)
            assert len(anss) == n_ans_per_txt, ipdb.set_trace()

        for ans in anss:
            txts_w_ans.append(f"{txt} {ANS_TOK} {ans}")
            all_txt.append(txt)
            all_ans.append(ans)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(txt_w_ans_file, 'w') as out_fh:
        for txt in txts_w_ans:
            out_fh.write(f'{txt}\n')
    with open(txt_file, 'w') as out_fh:
        for txt in all_txt:
            out_fh.write(f'{txt}\n')
    with open(ans_file, 'w') as out_fh:
        for ans in all_ans:
            out_fh.write(f'{ans}\n')
    print("\tDone!")
    print(f"\tWrote {len(txts_w_ans)} sentences to {txt_w_ans_file}")
Example #10
from main import *
import config
import utils
import visited
import globalvar as gl

gl._init_globalar()
config._init()
v = utils.load_txt(config.get_value("filename"))
for i in range(30):
    print("run...", i)
    config._init()
    visited._init_visited_set()
    filename = config.get_value("filename")

    main(data=v,
         wmin=config.get_value("wmin"),
         wmax=config.get_value("wmax"),
         pop_size=config.get_value("pop_size"),
         max_evaluations=config.get_value("max_evaluations"),
         display=True)

    if (config.get_value("SHOW_CONVERGENCE_RATE")):
        utils.show_convergence_rate()
    if (config.get_value("SHOW_SWARM_DISTRIBUTION")):
        utils.show_swarm_distribution()
print("Best_SUM= ", gl.get_value("gbest_sum"))
def create_input_Train():
    ###############################################################################
    # This function creates the training input sentences, the labels and sentences
    # with the candidate synsets
    #
    # Input:
    #   None
    #
    # Output:
    #   : list with sentences, labels and candidate synsets
    #   : list with ids of the candidate synsets
    ###############################################################################

    # list of padded sentences (string sentences)
    X_train = [
        sentence.split() + [''] * (MAX_LENGTH - len(sentence.split())) if
        len(sentence.split()) <= MAX_LENGTH else sentence.split()[:MAX_LENGTH]
        for sentence in utils.load_txt(
            "../resources/Train/semcor_omsti_string.txt")
    ]

    # list of padded sentences (ids sentences)
    X_train_int = [
        list(map(int, sentence.split()))
        for sentence in utils.load_txt("../resources/Train/semcor_omsti.txt")
    ]
    X_train_int = pad_sequences(X_train_int,
                                truncating='pre',
                                padding='post',
                                maxlen=MAX_LENGTH)

    # lists of padded labels
    Y_POS = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/POS.txt")
    ]
    Y_POS = pad_sequences(Y_POS,
                          truncating='pre',
                          padding='post',
                          maxlen=MAX_LENGTH)

    Y_wndomain = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/Wndomain.txt")
    ]
    Y_wndomain = pad_sequences(Y_wndomain,
                               truncating='pre',
                               padding='post',
                               maxlen=MAX_LENGTH)

    Y_lex = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/lexnames.txt")
    ]
    Y_lex = pad_sequences(Y_lex,
                          truncating='pre',
                          padding='post',
                          maxlen=MAX_LENGTH)

    # lists of padded candidate synsets sentences
    X_candidate_wndomain, ids_candidates_wndomain = cs.create_candidates(
        path_train_json, type_=1)
    X_candidate_wndomain = pad_sequences(X_candidate_wndomain,
                                         truncating='pre',
                                         padding='post',
                                         maxlen=MAX_LENGTH)

    X_candidate_lexname, ids_candidates_lexname = cs.create_candidates(
        path_train_json, type_=2)
    X_candidate_lexname = pad_sequences(X_candidate_lexname,
                                        truncating='pre',
                                        padding='post',
                                        maxlen=MAX_LENGTH)

    return [
        X_train, X_train_int, Y_wndomain, Y_lex, Y_POS, X_candidate_wndomain,
        X_candidate_lexname
    ], [ids_candidates_wndomain, ids_candidates_lexname]
Example #12
def aggregate_questions_from_txt(out_dir,
                                 src_txt_file,
                                 gen_txt_file,
                                 gen_qst_file,
                                 gen_prob_file=None,
                                 gen_ans_file=None,
                                 gen_prd_file=None,
                                 src_w_trg_txt_file=None,
                                 use_all_qsts=False,
                                 use_act_anss=False,
                                 use_exp_anss=False,
                                 n_gen_qsts=10,
                                 n_ans=10,
                                 n_qsts=20):
    """ Extract questions generated from src, trg, and gen
    with the corresponding field from fseq logs (one log/txt) and write to jsonl.
    Each fseq log should have the txt field as 'source' (S)
    and the questions as generated 'hypotheses' (H)

    args:
        - src_txt_file: txt file or source inputs (e.g. articles for summarization)
            - src_w_trg_txt_file (optional): special src inputs with the trg re-appended for XSUM
        - gen_txt_file: txt file of model-generated targets (e.g. summaries for summarization)
        - gen_qst_file: txt file of questions generated conditioned on src/gen
        - gen_prob_file: txt file of {src/gen} question probabilities according to QG model
        - gen_prd_file (optional): txt file of answers predicted by QA model on src/gen_qst_file

        n_ans: the number of answer candidates per text
        n_gen_qsts: the number of questions generated per (text, answer) pair
        n_qsts: the number of questions to use for each example
        use_all_qsts: use all questions
        use_act_anss: filter out [NO_ANS] questions
        use_exp_anss: filter out questions where prediction doesn't match expected answer
    """

    assert not (use_exp_anss and (gen_ans_file is None)
                ), "Trying to use expected answers, but not provided any!"
    assert not (
        use_act_anss and (gen_ans_file is None)
    ), "Trying to use predicted answers, but not provided expected answers!"
    assert not (use_act_anss and (gen_prd_file is None)
                ), "Trying to use predicted answers, but not provided any!"

    files = {
        "src": {
            "txt": src_txt_file
        },
        "gen": {
            "txt": gen_txt_file,
            "qst": gen_qst_file,
            "prb": gen_prob_file,
            "ans": gen_ans_file,
            "prd": gen_prd_file
        },
    }

    # the number of original examples (not counting answer candidates)
    n_exs = None
    # number of total generated questions per example (across answer candidates and generated questions)
    n_qsts_per_ex = n_ans * n_gen_qsts

    # load all data
    all_txts, all_qsts = {}, {}
    for txt_fld, field_files in files.items():
        txts = load_txt(field_files["txt"])
        all_txts[txt_fld] = txts
        if txt_fld == "src" and src_w_trg_txt_file is not None:
            txts = load_txt(src_w_trg_txt_file)
            all_txts["src_w_trg"] = txts

        if n_exs is None:  # infer number of examples
            n_exs = len(txts)
        else:
            assert len(txts) == n_exs, \
                f"Different numbers of txts detected! Expected {n_exs} but found {len(txts)} for {txt_fld}."

        # load questions, probabilities, (expected) answers only based on generation
        if txt_fld != "gen":
            continue
        qsts = load_txt(field_files["qst"])
        prbs = [float(p) for p in load_txt(field_files["prb"])]  # if field_files["prb"] is not None else list()
        anss = load_txt(field_files["ans"]) if use_exp_anss else []
        # optionally load QA model predictions
        if use_act_anss:
            raw_prds = json.load(open(field_files["prd"]))
            prds = [raw_prds[str(i)] for i in range(len(raw_prds))]
        else:
            prds = list()
        all_qsts[txt_fld] = (qsts, prbs, anss, prds)
    print(
        f"Formatting QA data for {n_exs} examples, filtering {n_qsts_per_ex} questions per example to {n_qsts}"
    )

    # build the data then write out a SQuAD format file
    # dummy iterator in case we want to condition questions on something else outside of gen
    for qst_src in all_qsts:
        qsts, prbs, anss, prds = all_qsts[qst_src]
        all_clean_qsts = []

        # Filter questions
        # Extract questions assuming there's a constant number per example and in order
        for i in tqdm(range(n_exs), desc="Filtering questions"):
            cand_qsts = qsts[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)]
            cand_prbs = prbs[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)]
            cand_anss = anss[(i * n_ans):((i + 1) * n_ans)] if anss else None
            cand_prds = prds[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)] if prds else None
            if not use_all_qsts:
                ret = filter_qsts(cand_qsts,
                                  n_qsts,
                                  prbs=cand_prbs,
                                  reverse_prob=False,
                                  exp_anss=cand_anss,
                                  act_anss=cand_prds)
            else:
                ret = {
                    'qsts': cand_qsts,
                    'n_clean_qsts': len(cand_qsts),
                    'n_qsts_w_ans': None,
                    'n_qsts_w_match_ans': None,
                }
            clean_qsts = ret['qsts']
            for qst in clean_qsts:
                assert not isinstance(qst, list), "List instead of string detected!"
            all_clean_qsts.append(clean_qsts)

        # Construct data in SQuAD-like format, using both src (article) and gen (model generation) as context
        for txt_fld in all_txts:
            if use_all_qsts and txt_fld != "gen":
                # case where we want to get answers for all our questions
                # and we want to just use the generations to do that, assuming we generated from generations
                continue
            txts = all_txts[txt_fld]
            raw_data = {}

            for i in tqdm(range(n_exs), desc="Formatting data"):
                if len(txts) == len(qsts):
                    txt = txts[i * n_ans].split()
                elif len(txts) < len(qsts):
                    assert len(qsts) / len(txts) == n_qsts_per_ex, \
                            f"Expected constant number of questions ({n_qsts_per_ex}) per example! Found {len(qsts)} total questions for {len(txts)} examples"
                    txt = txts[i].split()
                else:
                    raise IndexError(
                        "Number of questions should be weakly greater than number of examples!"
                    )
                clean_qsts = all_clean_qsts[i]
                raw_data[i] = {txt_fld: txt, "hypotheses": clean_qsts}

            data = format_squad(raw_data, context=txt_fld, ctx_split=True)
            out_file = f"{out_dir}/{txt_fld}.json"
            print(f"Writing to {out_file}")
            json.dump(data, open(out_file, "w", encoding="utf-8"))
Example #13
import numpy as np
import os
import PIL
import torch
import torchvision

from PIL import Image
from torch.utils.data import Subset
from torchvision import datasets

from utils import load_txt

corruptions = load_txt('./src/corruptions.txt')


class CIFAR10C(datasets.VisionDataset):
    def __init__(self, root: str, name: str,
                 transform=None, target_transform=None):
        assert name in corruptions
        super(CIFAR10C, self).__init__(
            root, transform=transform,
            target_transform=target_transform
        )
        data_path = os.path.join(root, name + '.npy')
        target_path = os.path.join(root, 'labels.npy')
        
        self.data = np.load(data_path)
        self.targets = np.load(target_path)
        
    def __getitem__(self, index):
        img, targets = self.data[index], self.targets[index]
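        # Assumed completion -- the example is truncated here in the source. A standard
        # torchvision-style __getitem__ would convert the array to a PIL image, apply the
        # transforms, and return the pair, with __len__ reporting the dataset size.
        img = Image.fromarray(img)
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            targets = self.target_transform(targets)
        return img, targets

    def __len__(self):
        return len(self.data)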
Example #14
def get_train_test_data(vocab,
                        asin_map_file, text_data_file,
                        train_asin_map_file, test_asin_map_file,
                        train_asin_label_fea_file, test_asin_label_fea_file):
    vocab = load_pickle('../material/' + vocab)
    vocab_dict = dict(zip(vocab, range(len(vocab))))
    asin_map = load_pickle(asin_map_file)
    asin_map_dict = dict(zip(asin_map, range(len(asin_map))))
    text_data = load_pickle(text_data_file)
    #
    train_asin_label_fea = load_txt(train_asin_label_fea_file)[1:]
    test_asin_label_fea = load_txt(test_asin_label_fea_file)[1:]
    # remove all replica in both train and test asin map
    train_asin_map = get_asin_from_map_file(train_asin_map_file)
    test_asin_map = get_asin_from_map_file(test_asin_map_file)
    train_asin_map_dict = dict(zip(train_asin_map, range(len(train_asin_map))))
    test_asin_map_dict = dict(zip(test_asin_map, range(len(test_asin_map))))
    #
    train_rep = [item for item, count in collections.Counter(train_asin_map).items() if count > 1]
    test_rep = [item for item, count in collections.Counter(test_asin_map).items() if count > 1]
    train_asin = set(train_asin_map) - set(train_rep)
    test_asin = set(test_asin_map) - set(test_rep)
    # remove asins from test data which appear in train data
    test_asin = test_asin - train_asin
    train_asin = list(train_asin)
    test_asin = list(test_asin)
    # train
    print 'get train data'
    all_labels = []
    train_doc_wordID = {}
    train_label = {}
    train_feature = {}
    k = 0
    for t in train_asin:
        k = k + 1
        if k % 10000 == 0:
            print k
        try:
            #ind = asin_map.index(t)
            ind = asin_map_dict[t]
        except KeyError:
            continue
        #train_data[ind] = clean_str(text_data[ind])
        text = clean_str(text_data[ind])
        wordID = get_wordID_from_vocab_dict_for_raw_text(text, vocab_dict)
        if not wordID:
            continue
        train_doc_wordID[ind] = wordID
        #
        #line = train_asin_label_fea[train_asin_map.index(t)]
        line = train_asin_label_fea[train_asin_map_dict[t]]
        labels_str, feature_str = line.split(' ', 1)
        labels = [int(label) for label in labels_str.split(',')]
        all_labels.append(labels)
        train_label[ind] = labels
        train_feature[ind] = feature_str
    # test
    print 'get test data'
    test_doc_wordID = {}
    test_label = {}
    test_feature = {}
    k = 0
    for t in test_asin:
        k = k + 1
        if k % 10000 == 0:
            print k
        try:
            ind = asin_map_dict[t]
            #ind = asin_map.index(t)
        except KeyError:
            continue
        #test_data[ind] = clean_str(text_data[ind])
        text = clean_str(text_data[ind])
        wordID = get_wordID_from_vocab_dict_for_raw_text(text, vocab_dict)
        if not wordID:
            continue
        test_doc_wordID[ind] = wordID
        #
        #line = test_asin_label_fea[test_asin_map.index(t)]
        line = test_asin_label_fea[test_asin_map_dict[t]]
        labels_str, feature_str = line.split(' ', 1)
        labels = [int(label) for label in labels_str.split(',')]
        test_label[ind] = labels
        test_feature[ind] = feature_str
    all_labels = np.unique(np.concatenate(all_labels)).tolist()
    print 'dump train/test data'
    dump_pickle(train_feature, data_source_path + 'train_feature.pkl')
    dump_pickle(test_feature, data_source_path + 'test_feature.pkl')
    dump_pickle(train_doc_wordID, data_source_path + 'train_doc_wordID.pkl')
    dump_pickle(train_label, data_source_path + 'train_label.pkl')
    dump_pickle(test_doc_wordID, data_source_path + 'test_doc_wordID.pkl')
    dump_pickle(test_label, data_source_path + 'test_label.pkl')
    return all_labels, train_doc_wordID, train_label, test_doc_wordID, test_label, train_feature, test_feature
Example #15
    def __init__(self, split, exp_dict, root):
        self.transform_function = ut.bgrNormalize()
        self.collate_fn = ut.collate_fn_0_4
        self.split = split

        self.img_names = []
        self.mask_names = []
        self.cls_names = []

        # train
        train_img_names = ut.load_txt(root +
                                      "/ImageSets/Segmentation/train.txt")
        val_img_names = ut.load_txt(root + "/ImageSets/Segmentation/val.txt")

        assert len(train_img_names) == 1464
        assert len(val_img_names) == 1449

        if split == 'train':
            for name in train_img_names:
                name = name.replace("\n", "")
                name_img = os.path.join(root, 'JPEGImages/' + name + '.jpg')
                name_mask = os.path.join(root,
                                         'SegmentationObject/' + name + '.png')
                name_cls = os.path.join(root,
                                        'SegmentationClass/' + name + '.png')

                self.img_names += [name_img]
                self.mask_names += [name_mask]
                self.cls_names += [name_cls]

            self.img_names.sort()
            self.cls_names.sort()
            self.mask_names.sort()

            self.img_names = np.array(self.img_names)
            self.cls_names = np.array(self.cls_names)
            self.mask_names = np.array(self.mask_names)

        elif split in ['val']:
            for k, name in enumerate(val_img_names):
                name = name.replace("\n", "")
                name_img = os.path.join(root, 'JPEGImages/' + name + '.jpg')
                name_mask = os.path.join(root,
                                         'SegmentationObject/' + name + '.png')
                name_cls = os.path.join(root,
                                        'SegmentationClass/' + name + '.png')

                assert os.path.exists(name_img)
                assert os.path.exists(name_mask)
                assert os.path.exists(name_cls)

                self.img_names += [name_img]
                self.mask_names += [name_mask]
                self.cls_names += [name_cls]

        self.n_classes = 21
        self.ignore_index = 255
        self.exp_dict = exp_dict

        if split == "val":
            annList_path = "./datasets/annotations/val_gt_annList.json"
            self.annList_path = annList_path

        self.sm_proposal_dict = ut.load_json("./datasets/proposal_dict.json")
        self.prm_point_dict = ut.load_json("./datasets/prm_point_dict.json")
Example #16
import pprint
import torch
import torchvision
import tqdm

from glob import glob
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from tqdm import tqdm

from utils import load_txt, accuracy, create_barplot, get_fname, AverageMeter
from models.resnet import ResNet56
from dataset import CIFAR10C

CORRUPTIONS = load_txt('./src/corruptions.txt')
MEAN = [0.49139968, 0.48215841, 0.44653091]
STD = [0.24703223, 0.24348513, 0.26158784]


def main(opt, weight_path: str):

    device = torch.device(opt.gpu_id)

    # model
    if opt.arch == 'resnet56':
        model = ResNet56()
    else:
        raise ValueError()
    try:
        model.load_state_dict(torch.load(weight_path, map_location='cpu'))
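    except (FileNotFoundError, RuntimeError) as err:
        # Assumed completion: the example is cut off here in the source, so the original
        # except-branch (and the rest of main) is unknown; report the failure and re-raise.
        print(f'failed to load weights from {weight_path}: {err}')
        raise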
Example #17
def get_candidate_labels(path, out_path, type, format):
    train_titles_file = path + 'train_map.txt'
    test_titles_file = path + 'test_map.txt'
    # train_candidate_label_file = path + type + '_candidate/candidate_train.mat'
    # test_candidate_label_file = path + type + '_candidate/candidate_test.mat'
    #label_index_file = path + 'label_dict.pkl'
    index_label_file = path + 'all_labels.pkl'
    train_titles = load_txt(train_titles_file)
    test_titles = load_txt(test_titles_file)
    index_label = load_pickle(index_label_file)
    train_candidate_labels = {}
    test_candidate_labels = {}
    if format == 'mat':
        train_candidate_label_file = path + type + '_candidate/candidate_train.mat'
        test_candidate_label_file = path + type + '_candidate/candidate_test.mat'
        train_candidate_all = sio.loadmat(train_candidate_label_file)['candidate_train']
        test_candidate_all = sio.loadmat(test_candidate_label_file)['candidate_test']
        for i in xrange(len(train_titles)):
            pid = train_titles[i].strip()
            pid = int(pid)
            candidate_label_index = train_candidate_all[i]
            candidate_labels = [index_label[ind] for ind in candidate_label_index]
            train_candidate_labels[pid] = candidate_labels
        for i in xrange(len(test_titles)):
            pid = test_titles[i].strip()
            pid = int(pid)
            candidate_label_index = test_candidate_all[i]
            candidate_labels = [index_label[ind] for ind in candidate_label_index]
            test_candidate_labels[pid] = candidate_labels
    elif format == 'txt':
        train_candidate_label_file = path + type + '_candidate/train_score_mat.txt'
        test_candidate_label_file = path + type + '_candidate/test_score_mat.txt'
        train_candidate_all = load_txt(train_candidate_label_file)[1:]
        test_candidate_all = load_txt(test_candidate_label_file)[1:]
        for i in xrange(len(train_titles)):
            pid = int(train_titles[i].strip())
            candidate_label_line = train_candidate_all[i].strip()
            temp = []
            for l_s in candidate_label_line.split(' '):
                l_, s_ = l_s.split(':')
                ll = index_label[int(l_)]
                temp.append((ll, float(s_)))
            sorted_temp = sorted(temp, key=lambda e: e[1], reverse=True)
            train_candidate_labels[pid] = dict(sorted_temp[:50])
            # candidate_label_score = {}
            # for l_s in candidate_label_line.split(' ')[:30]:
            #     l_, s_ = l_s.split(':')
            #     ll = index_label[int(l_)]
            #     candidate_label_score[ll] = float(s_)
            # train_candidate_labels[pid] = candidate_label_score
        for i in xrange(len(test_titles)):
            pid = int(test_titles[i].strip())
            candidate_label_line = test_candidate_all[i].strip()
            temp = []
            for l_s in candidate_label_line.split(' '):
                l_, s_ = l_s.split(':')
                ll = index_label[int(l_)]
                temp.append((ll, float(s_)))
            sorted_temp = sorted(temp, key=lambda e: e[1], reverse=True)
            test_candidate_labels[pid] = dict(sorted_temp[:50])
            # candidate_label_score = {}
            # for l_s in candidate_label_line.split(' ')[:30]:
            #     l_, s_ = l_s.split(':')
            #     ll = index_label[int(l_)]
            #     candidate_label_score[ll] = float(s_)
            # test_candidate_labels[pid] = candidate_label_score

    dump_pickle(train_candidate_labels, out_path + type + '_candidate/train_candidate_label.pkl')
    dump_pickle(test_candidate_labels, out_path + type + '_candidate/test_candidate_label.pkl')
Example #18
 def __init__(self):
     # register the custom dictionary with jieba and load the stop-word list
     for word in load_txt('./cut/my_word.txt'):
         jieba.add_word(word)
     self.stop_dict = set(load_txt('./cut/my_stop_word.txt'))