def sub_process(numIters, pool, id):
    dictionary = Dictionary()
    dictionary.add_word('<pad>')  # add padding word
    with open(args.output + str(id), 'w') as fout:
        qdar = tqdm.tqdm(range(numIters), total=numIters, ascii=True)
        for i in qdar:
            # for item in pool:
            item = pool[i]
            # words = tokenizer(' '.join(item['text'].split()))
            words = SymSpellCheck(item['text'])
            data = {
                'label': int(item['stars']) - 1,
                'text': [proc_token(x) for x in words]
            }
            fout.write(json.dumps(data) + '\n')
            fout.flush()
        # for item in data['text']:
        #     dictionary.add_word(item)
        # qdar.set_postfix(dictSize=str(len(dictionary)))
    with open(args.dict + str(id), 'w') as fout:  # save the dictionary so the next stage can load it quickly
        fout.write(json.dumps(dictionary.idx2word) + '\n')
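A minimal, hedged sketch of how sub_process could be driven in parallel with the standard multiprocessing module; the reviews list and n_workers are hypothetical names, and args.output / args.dict are assumed to be configured as in the surrounding script.

from multiprocessing import Process

def run_parallel(reviews, n_workers=4):
    # Each worker writes args.output<id> and args.dict<id>, mirroring sub_process above.
    jobs = []
    for wid in range(n_workers):
        shard = reviews[wid::n_workers]  # stride so every review lands in exactly one shard
        p = Process(target=sub_process, args=(len(shard), shard, wid))
        p.start()
        jobs.append(p)
    for p in jobs:
        p.join()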
Example #2
def get_wiki_curated_NDWdata():
    X, y = get_wiki_curated_data()
    train_idx = []
    dd = Dictionary()
    for idx, label in enumerate(y):
        if label == '':
            continue
        if not dd.isDW(label):
            train_idx.append(idx)
    y = [y[i] for i in train_idx]
    X = X[train_idx]
    return X, y
Example #3
def get_wiki_bow_DWdata():
    X, y = get_wiki_bow_data()
    train_idx = []
    NDW_idx = []
    dd = Dictionary()
    for idx, label in enumerate(y):
        if label == '':
            continue
        if dd.isDW(label):
            train_idx.append(idx)
        else:
            NDW_idx.append(idx)
    NDW_y = [y[i] for i in NDW_idx]
    NDW_X = X[NDW_idx]
    y = [y[i] for i in train_idx]
    X = X[train_idx]
    return X, y, NDW_X, NDW_y
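A hedged example of how the two helpers above might be called; the variable names on the left are placeholders, and the DW / non-DW split is whatever Dictionary.isDW decides.

X_dw, y_dw, X_ndw, y_ndw = get_wiki_bow_DWdata()   # bag-of-words data split by Dictionary.isDW(label)
X_curated, y_curated = get_wiki_curated_NDWdata()  # curated data keeping only labels with isDW(label) == False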
Example #4
def read_hownet(hownet_path):
    hownet = Dictionary()
    with open(hownet_path, encoding='UTF8') as f:
        sense_id = -1
        word = ''
        kdml = ''
        for line in f:
            if line.startswith('NO'):
                stage = 1
            elif line.startswith('W_C'):
                stage = 2
            elif line.startswith('DEF'):
                stage = 3
            else:
                stage = 0

            if stage == 1:
                sense_id = int(line[4:-1])
            elif stage == 2:
                word = line[4:-1]
            elif stage == 3:
                kdml = line[4:-1]
                hownet.add_sense(sense_id, word, kdml)
    return hownet
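The parser above expects HowNet-style records in which each sense is described by lines starting with NO.=, W_C= and DEF=, and line[4:-1] strips that 4-character prefix plus the trailing newline. A hedged usage sketch (the file path is an assumption):

# A record is expected to look roughly like:
#   NO.=000000000001
#   W_C=阿
#   DEF={...}
hownet = read_hownet('HowNet.txt')  # path is a placeholder; Dictionary.add_sense stores (sense_id, word, kdml)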
Example #5
    def __init__(self):
        self.assistant = Assistant(lang='de')
        self.dictionary = Dictionary()
        set_lang('de')
Example #6
class Galileo:
    def __init__(self):
        self.assistant = Assistant(lang='de')
        self.dictionary = Dictionary()
        set_lang('de')

    def start_service(self):
        while True:
            if "galileo" in self.assistant.listen():
                self.assistant.speak(
                    'Hallo, möchtest du eine Erklärung erhalten oder eine Erklärung eingeben?'
                )
                answer = self.assistant.listen()
                if not self.check_answer(answer):
                    continue
                if 'erhalten' in answer:
                    self.process_explanation()
                elif 'eingeben' in answer:
                    self.get_explanation()
                else:
                    self.assistant.speak(
                        'Tut mir leid, das kann ich noch nicht!')

    def process_explanation(self):
        self.assistant.speak("Gib bitte ein Thema an!")
        answer = self.assistant.listen()
        if not self.check_answer(answer):
            return
        for topic in self.dictionary.get_dict():
            if topic in answer:
                self.assistant.speak(
                    f'Die Erklärung zu dem Thema {topic} lautet: {self.dictionary.get_dict()[topic]}!'
                )
                return
        try:
            self.assistant.speak(
                page(search(answer)[0]).summary.split('\n')[0])
        except (IndexError, PageError):
            self.assistant.speak(
                'Zu diesem Thema wurde noch keine Erklärung eingegeben, deshalb kann ich dir leider nicht helfen.'
            )

    def get_explanation(self):
        self.assistant.speak('Wie lautet dein Thema?')
        first_answer = self.assistant.listen()
        if not self.check_answer(first_answer):
            return
        self.assistant.speak('Gib die Erklärung zum entsprechenden Thema ein!')
        second_answer = self.assistant.listen()
        if not self.check_answer(second_answer):
            return
        self.assistant.speak(
            f'Deine Erklärung für das Thema {first_answer} war {second_answer}! Ist das für dich in Ordnung?'
        )
        third_answer = self.assistant.listen()
        if not self.check_answer(third_answer):
            return
        if 'ja' in third_answer:
            self.dictionary.add_value(first_answer, second_answer)
            self.assistant.speak(
                'Danke, die anderen Kinder werden sich freuen!')
        else:
            self.assistant.speak(
                'Ok, dann breche ich den aktuellen Vorgang ab!')

    def check_answer(self, answer):
        if answer == '':
            self.assistant.speak(
                'Da du nicht mehr mit mir geredet hast, breche ich den aktuellen Vorgang ab!'
            )
            return False
        else:
            return True
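A hedged usage sketch for the class above; it assumes Assistant, Dictionary and set_lang come from the surrounding project, and that search, page and PageError come from the wikipedia package.

if __name__ == '__main__':
    galileo = Galileo()      # German-speaking assistant plus the topic dictionary
    galileo.start_service()  # blocks and listens for the wake word "galileo"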
Example #7
        device = torch.device('cpu')
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            print("WARNING: CUDA device detected, but continuing to use the CPU device!")

    random.seed(args.seed)

    # Load Dictionary
    assert os.path.exists(args.train_data), "No training data detected!"
    assert os.path.exists(args.val_data), "No validation data detected!"
    assert os.path.exists(args.test_data), "No test data detected!"
    print('Begin to load the dictionary.')
    dictionary = Dictionary(path=args.dictionary)

    # n_token: number of tokens in the dictionary
    n_token = len(dictionary)

    # initialize the classifier; passing the dictionary object in the config dict keeps the setup readable and easy to reuse
    # important: remember to change the classifier type when switching to another model
    if args.encoder == "CNN":
        model = Classifier_CNN({
            'dropout': args.dropout,
            'ntoken': n_token,
            'ninp': args.emsize,
            'encoder': args.encoder,
            'nfc': args.nfc,
            'dictionary': dictionary,
            'word-vector': args.word_vector,
Example #8
def main(argv):
    import sys
    import gzip, os
    from itertools import ifilter, imap
    from util import ureader, uwriter, closing, Dictionary, SpanishSet, EnglishSet
    
    if '-h' in sys.argv: 
        usage()
     
    # Reverse source and target during collection
    reverse = '-r' in argv
    _contextual = '-c' in argv
    _probabilities = '-p' in argv
    
    if not '-f' in argv:
        usage()
        
    p_args = 1 + argv.index('-f')    
    src_lang, tgt_lang, in_fname, out_fname = argv[p_args:p_args + 4]

    # The input stream
    def check_record(categories):
        def _do_check((sent_id, src_id, src_word, src_pos, tgt_id, tgt_word, tgt_pos)):
            "Check <en, es> is a pair of words with pos in categories."
            pos = src_pos[0] # same as tgt_pos[0]
            return pos in categories and is_word(src_word, 'en') and is_word(tgt_word, 'es')
        return _do_check
            
    print 'Creating dictionaries ...',
    dictionary = Dictionary(en=EnglishSet(), es=SpanishSet())
    is_word = dictionary.is_word
    print 'done.'
    
    def streams_for(categories, mode):
        return map(lambda c: uwriter(gzip.open('{}.{}.gz'.format(out_fname, c.lower()), mode)), categories)

    def noncontextual():
        if '-c' in argv or '-p' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
            
        categories = 'nv'
        idx = categories.index
        def fields(rec): 
            return idx(rec[3][0]), rec[2], rec[5]
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record(categories), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing noncontextually', in_stream, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories, reverse=reverse)
            process(collected, out_streams, src_lang, tgt_lang, categories, reverse)
            print 'done.'
    
    def contextual():
        """Contextual, with probabilities.
        """
        from itertools import product
        from util import grouped
        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
        
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'

        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'

        print 'Indexing relations ...',
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        
        UNK = u'__unk__'
        
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        
        idx = categories.index 
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)
        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return idx(dep[3]), (rec[src], dep[4]), rec[tgt] 
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        dotjoin = '.'.join
        fnames = [dotjoin(p) for p in product(categories, ('px', 'pmi'))] 
        with closing(*streams_for(fnames, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect3(in_recs, categories, reverse=reverse)
            print 'done.\nOutputting files', ', '.join(fnames), '...' 
            process_p2(collected, grouped(2, out_streams), src_lang, tgt_lang)
            print 'done.'
            
    def contextual_noprob():
        """Contextual, without probabilities.
        """
        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
            
        print "Contextual, no probabilities."
        
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'

        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'

        print 'Indexing relations ...',
        # Record schema: sentence_id, noun_id, verb_id, noun, verb
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        
        UNK = u'__unk__'
        
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        
        index = categories.index 
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)
        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return index(dep[3]), (rec[src], rec[tgt]), dep[4] 
        
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories)
            print 'done.'
            
            assert len(collected) == len(out_streams), '{} != {}'.format(collected, out_streams)
            
            print 'Outputting files', ', '.join(categories), '...' 
            process_counts(collected, out_streams, src_lang, tgt_lang)
            print 'done.'
            
            
    def probabilities():
        if '-c' in argv or '-n' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
            
        src, tgt = (2, 5) if not reverse else (5, 2)    
        
        def fields(rec): 
            return rec[3][0], rec[src], rec[tgt]
        
#        categories = ['v.px', 'v.pmi'] 
        categories = ['n.px', 'n.pmi'] 

        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('n'), map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'

        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            in_recs = imap(fields, in_stream)
            print 'Processing probabilities in', in_fname, 'for', categories, '...',
            collected = collect2(in_recs, reverse=reverse)
            print 'done.'
            
            print 'Outputting files ...',
            process_p(collected, out_streams, src_lang, tgt_lang)
            
            print 'done.'
    
    # Dispatch on the -c/-p flags; contextual() handles the -c + -p case,
    # probabilities() the -p-only case (it rejects -c itself).
    if not _contextual:
        if _probabilities:
            probabilities()
        else:
            noncontextual()
    elif not _probabilities:
        contextual_noprob()
    else:
        contextual()
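Given the flag handling above, invocations would look roughly like the following (a hedged sketch; the file names are placeholders):

# noncontextual counts:           script.py -f en es aligned.gz out
# noncontextual probabilities:    script.py -p -f en es aligned.gz out
# contextual, no probabilities:   script.py -c relations.gz -f en es aligned.gz out
# contextual with probabilities:  script.py -c relations.gz -p -f en es aligned.gz out
# add -r to reverse source and target during collection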
import argparse
import random
from util import Dictionary
import spacy


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Tokenizer')
    parser.add_argument('--input', type=str, default='', help='input file')
    parser.add_argument('--output', type=str, default='', help='output file')
    parser.add_argument('--labels', type=str, default='', help='label file')
    parser.add_argument('--dict', type=str, default='', help='dictionary file')
    parser.add_argument('--label-data', action='store_true', help='to parse label file into json format')
    parser.add_argument('--shuffle', action='store_true', help='output shuffled data to file')
    args = parser.parse_args()
    tokenizer = spacy.load('en_core_web_md')
    dictionary = Dictionary()
    dictionary.add_word('<pad>')  # add padding word
    lab2int = {}
    int2lab = {}
    with open(args.labels, 'r') as labfile:
        for line in labfile:
            labint, labtext = line.strip().split('\t')
            labint = int(labint)
            lab2int[labtext] = labint
            int2lab[labint] = labtext
    with open(args.output, 'w') as fout:
        lines = open(args.input).readlines()
        if args.shuffle: random.shuffle(lines)
        for i, line in enumerate(lines):
            if not line.startswith("#STARTDIALOGUE"):
                # data: input<tab>label<tab>response<tab>interp<tab>correct<tab>...
Example #10
def main(cfg):
    # Set the random seed manually for reproducibility.
    torch.manual_seed(cfg.seed)
    if torch.cuda.is_available():
        if not cfg.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(cfg.seed)
    random.seed(cfg.seed)

    # Load Dictionary
    assert os.path.exists(cfg.data.train_data)
    assert os.path.exists(cfg.data.val_data)
    print('Begin to load the dictionary.')
    global dictionary
    dictionary = Dictionary(path=cfg.data.dictionary)

    global best_val_loss
    global best_acc
    best_val_loss = None
    best_acc = None

    n_token = len(dictionary)

    global model
    model = Classifier({
        'dropout': cfg.model.dropout,
        'ntoken': n_token,
        'nlayers': cfg.model.nlayers,
        'nhid': cfg.model.nhid,
        'ninp': cfg.model.emsize,
        'pooling': 'all',
        'attention-unit': cfg.model.attention_unit,
        'attention-hops': cfg.model.attention_hops,
        'nfc': cfg.model.nfc,
        'dictionary': dictionary,
        'word-vector': cfg.data.word_vector,
        'class-number': cfg.class_number
    })
    if cfg.cuda:
        model = model.cuda()

    global I
    I = torch.zeros(cfg.training.batch_size, cfg.model.attention_hops,
                    cfg.model.attention_hops)
    for i in range(cfg.training.batch_size):
        for j in range(cfg.model.attention_hops):
            I.data[i][j][j] = 1
    if cfg.cuda:
        I = I.cuda()

    global criterion
    global optimizer
    criterion = nn.CrossEntropyLoss()
    if cfg.training.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=cfg.training.lr,
                               betas=[0.9, 0.999],
                               eps=1e-8,
                               weight_decay=0)
    elif cfg.training.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              lr=cfg.training.lr,
                              momentum=0.9,
                              weight_decay=0.01)
    else:
        raise Exception('For other optimizers, please add them yourself; '
                        'the supported ones are SGD and Adam.')
    print('Begin to load data.')
    global data_train
    data_train = open(cfg.data.train_data).readlines()
    global data_val
    data_val = open(cfg.data.val_data).readlines()
    try:
        for epoch in range(cfg.training.epochs):
            train(epoch, cfg)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exit from training early.')
        data_val = open(cfg.data.test_data).readlines()
        evaluate_start_time = time.time()
        test_loss, acc = evaluate(cfg)
        print('-' * 89)
        fmt = '| test | time: {:5.2f}s | test loss (pure) {:5.4f} | Acc {:8.4f}'
        print(fmt.format((time.time() - evaluate_start_time), test_loss, acc))
        print('-' * 89)
        exit(0)
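A hedged sketch of the nested config that main(cfg) reads (every value below is a placeholder; the real project presumably builds cfg from a config file):

from types import SimpleNamespace

cfg = SimpleNamespace(
    seed=1111, cuda=False, class_number=5,
    data=SimpleNamespace(train_data='train.json', val_data='val.json',
                         test_data='test.json', dictionary='dict.json',
                         word_vector=''),
    model=SimpleNamespace(dropout=0.5, nlayers=2, nhid=300, emsize=300,
                          attention_unit=350, attention_hops=4, nfc=512),
    training=SimpleNamespace(batch_size=32, optimizer='Adam', lr=0.001, epochs=10),
)
main(cfg)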
Example #11
def load_data_set(vocab_size, dataset_type, level='l3'):
    """
    Loads the dataset.

    Args:
        vocab_size: {int} size of the vocabulary
        dataset_type: DBpedia, WIKI or WOS
        level: {str} which label column to read (default 'l3')
    Returns:
        x_train: {df} with col_names=['text','label']
        x_val: {df} df['text'][0] is an np array with indices of words
        x_test: {df} df['label'][0] is an np.int with the index of the label category
        dictionary: {Dictionary} with word2idx/idx2word mappings
        cat2id: {dict} categories mapped to indices
    """

    save_list_str = [
        "x_train", "x_val", "x_test", "dictionary", "dataLoader.cat2id"
    ]
    return_list = []
    if os.path.exists("./data/%s.x_train.p" % dataset_type):
        print("---loading pre-process %s data---" % dataset_type)
        # load already processed data
        for item in save_list_str:
            with open("./data/%s." % dataset_type + item + ".p", "rb") as f:
                return_list.append(pickle.load(f))
        print("----finish data loading----{%d} train,{%d} val, {%d} test" %
              (len(return_list[0]), len(return_list[1]), len(return_list[2])))
        print("{%d} words in dictionary, {%d} classes----" %
              (len(return_list[3]), len(return_list[4])))

        dictionary = return_list[3]
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        dataLoader = Data_Utility()
        dataLoader.cat2id = return_list[4]
        x_test = dataLoader.read(data_loc=data_loc,
                                 file_name="wos_data_n_test.csv",
                                 column='l2')
        x_test['label'] = dataLoader.transfer_cat_to_id(x_test['label'])
        return_list[2] = x_test

        return return_list

    print("----initial %s data loading and processing----" % dataset_type)
    dataLoader = Data_Utility()
    if dataset_type == "DBpedia":
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        x_train = dataLoader.read(data_loc=data_loc,
                                  file_name="df_small_train.csv",
                                  column=level)
        x_test = dataLoader.read(data_loc=data_loc,
                                 file_name="df_small_test.csv",
                                 column=level)
    elif dataset_type == "WIKI":
        data_loc = '/home/ml/ksinha4/datasets/data_WIKI'
        x_train = dataLoader.read(data_loc=data_loc,
                                  file_name="full_docs_2_train.csv",
                                  column=level)
        x_test = dataLoader.read(data_loc=data_loc,
                                 file_name="full_docs_2_test.csv",
                                 column=level)
        # "/home/ml/ksinha4/datasets/data_WOS/WebOfScience/WOS46985"
    elif dataset_type == "WOS":
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        x_train = dataLoader.read(data_loc=data_loc,
                                  file_name="wos_data_n_train.csv",
                                  column=level)
        x_test = dataLoader.read(data_loc=data_loc,
                                 file_name="wos_data_n_test.csv",
                                 column=level)
    else:
        raise Exception('this dataset type is not implemented yet')
    x_val = x_train[:int(0.1 * len(x_train))]
    x_train = x_train[int(0.1 * len(x_train)):]
    print("----finish data loading----{%d} train,{%d} val, {%d} test" %
          (len(x_train), len(x_val), len(x_test)))

    # processing dictionary and cat2id
    dictionary = Dictionary()
    dictionary.word2idx, dictionary.idx2word = dataLoader.assign_word_ids(
        x_train['text'].append(x_val['text']), vocab_size=vocab_size)
    dataLoader.assign_category_ids(list(x_train['label']) + \
                                   list(x_val['label']) + list(x_test['label']))
    x_train['label'] = dataLoader.transfer_cat_to_id(x_train['label'])
    x_val['label'] = dataLoader.transfer_cat_to_id(x_val['label'])
    x_test['label'] = dataLoader.transfer_cat_to_id(x_test['label'])
    print("----processed {%d} word_2_id, {%d}cat_2_id----" %\
          (len(dictionary.word2idx),len(dataLoader.cat2id)))

    # save the processed files in pickle
    save_list = [x_train, x_val, x_test, dictionary, dataLoader.cat2id]
    for i in range(len(save_list_str)):
        if not os.path.exists("./data"):
            os.mkdir("./data")
        with open("./data/%s.%s.p" % (dataset_type, save_list_str[i]),
                  'wb') as f:
            pickle.dump(save_list[i], f)
    return x_train, x_val, x_test, dictionary, dataLoader.cat2id
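A hedged example call; it assumes the hard-coded data_loc paths inside the function exist on the machine running it.

x_train, x_val, x_test, dictionary, cat2id = load_data_set(
    vocab_size=30000, dataset_type="WOS", level="l3")  # vocab_size value is a placeholder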
from util import Dictionary
import json

print('Begin to load the dictionary.')
dictList = []
for i in range(16):
    dictList.append(Dictionary(path='./Data/data_clean/dict' + str(i)))

for i in range(15):
    for word in list(dictList[i + 1].word2idx.keys()):
        dictList[0].add_word(word)

print('dict size: ' + str(len(dictList[0])))

with open('./Data/data_clean/dictall', 'w') as fout:  # save the merged dictionary so the next stage can load it quickly
    fout.write(json.dumps(dictList[0].idx2word) + '\n')

with open('./Data/data_clean/trainsetall', "w") as fout:
    for i in range(16):
        with open('./Data/data_clean/trainset' + str(i), "r") as infile:
            for line in infile:
                fout.write(line)