def __getitem__(self, idx):
    label, guid, text_a, text_b = self.df.iloc[idx, :]
    tokens_1 = self.tokenizer.tokenize(convert_to_unicode(text_a))
    tokens_2 = self.tokenizer.tokenize(convert_to_unicode(text_b))
    _truncate_seq_pair(tokens_1, tokens_2, self.max_seq_len - 3)

    # assemble [CLS] + text_a + [SEP]
    word_pieces = ["[CLS]"]
    word_pieces += tokens_1 + ["[SEP]"]
    len_1 = len(tokens_1) + 2

    # assemble [CLS] + text_a + [SEP] + text_b + [SEP]
    word_pieces += tokens_2 + ["[SEP]"]
    len_2 = len(tokens_2) + 1

    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    tokens_tensor = torch.LongTensor(ids)
    segments_tensor = torch.LongTensor([0] * len_1 + [1] * len_2)

    # labels
    label_id = self.label_map[label]
    label_tensor = torch.tensor(label_id)
    return (tokens_tensor, segments_tensor, label_tensor)
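# Hedged usage sketch (not part of the original class): __getitem__ above
# returns variable-length token/segment tensors, so batching needs a padding
# collate function. The dataset instance name below and the assumption that
# id 0 is the [PAD] token are illustrative, not from the source.
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_pair_batch(samples):
    tokens, segments, labels = zip(*samples)
    tokens = pad_sequence(tokens, batch_first=True)      # pads with 0 ([PAD])
    segments = pad_sequence(segments, batch_first=True)
    masks = (tokens != 0).long()                          # attention mask
    return tokens, segments, masks, torch.stack(labels)

# loader = torch.utils.data.DataLoader(pair_dataset, batch_size=32,
#                                      collate_fn=pad_pair_batch)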
def _create_examples(self, lines, base=False):
    # Row layout (inferred from the indexing below): guid, source text,
    # target equation, dep, answers, numbers, ..., numdep.
    examples = []
    for data in lines:
        guid = data[0].replace('\ufeff', '')
        src = convert_to_unicode(data[1].replace('N', 'N ').replace('C', 'C '))
        tgt = convert_to_unicode(data[2]).replace('N', 'N ').replace('C', 'C ')
        try:
            self.answers[int(guid)] = \
                list(set([float(answer) for answer in data[4].split(' ')]))
        except Exception:
            # answers column missing or not numeric
            self.answers[int(guid)] = ''
        temnum = {}
        for dn, number in enumerate(data[5].split(' ')):
            temnum['n{}'.format(dn)] = number
        self.numbers[int(guid)] = temnum
        self.false_equations[int(guid)] = []
        if not base:
            dep = convert_to_unicode(data[3])
            numdep = convert_to_unicode(data[-1])
            examples.append(InputExample(
                guid=guid, src=src, tgt=tgt,
                dep=[int(f) for f in dep.split(' ') if f != ''] + [-10],
                numdep=[int(f) for f in numdep.split(' ') if f != '']))
        else:
            examples.append(InputExample(guid=guid, src=src, tgt=tgt))
    return examples
def _create_examples(self, lines):
    examples = []
    for data in lines:
        # lines: id, summary, text
        guid = data[0]
        src = convert_to_unicode(data[2])
        tgt = convert_to_unicode(data[1])
        examples.append(InputExample(guid=guid, src=src, tgt=tgt))
    return examples
def take_action(self, parsed_args):
    try:
        hostname = None
        if parsed_args.hostname:
            hostname = parsed_args.hostname.strip()
            hostname = utils.convert_to_unicode(hostname)

        inventory = Inventory.load()

        if hostname:
            host = inventory.get_host(hostname)
            if not host:
                _host_not_found(self.log, hostname)

        data = []
        host_groups = inventory.get_host_groups()
        if host_groups:
            if hostname:
                data.append((hostname, host_groups[hostname]))
            else:
                for (hostname, groupnames) in host_groups.items():
                    data.append((hostname, groupnames))
        else:
            data.append(('', ''))
        return (('Host', 'Groups'), sorted(data))
    except CommandError as e:
        raise e
    except Exception as e:
        raise Exception(traceback.format_exc())
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text = utils.convert_to_unicode(line.strip())
        labels = self._labels_words(text)
        text = re.sub(r'\s+', '', text.strip())
        examples.append(InputExample(guid=guid, text=text, labels=labels))
    return examples
def take_action(self, parsed_args):
    try:
        hostname = parsed_args.hostname.strip()
        hostname = utils.convert_to_unicode(hostname)
        inventory = Inventory.load()
        inventory.remove_host(hostname)
        Inventory.save(inventory)
    except CommandError as e:
        raise e
    except Exception as e:
        raise Exception(traceback.format_exc())
def take_action(self, parsed_args):
    try:
        if not parsed_args.hostname and not parsed_args.file:
            raise CommandError('Hostname or hosts info file path ' +
                               'is required')
        if parsed_args.hostname and parsed_args.file:
            raise CommandError('Hostname and hosts info file path ' +
                               'cannot both be present')
        inventory = Inventory.load()

        if parsed_args.file:
            # multi-host setup via yaml file
            hosts_data = self.get_yml_data(parsed_args.file.strip())
            inventory.setup_hosts(hosts_data)
        else:
            # single host setup
            hostname = parsed_args.hostname.strip()
            hostname = utils.convert_to_unicode(hostname)
            if not inventory.get_host(hostname):
                _host_not_found(self.log, hostname)

            check_ok = inventory.check_host(hostname, True)
            if check_ok:
                self.log.info(
                    'Skipping setup of host (%s) as check is ok' % hostname)
                return True

            if parsed_args.insecure:
                password = parsed_args.insecure.strip()
            else:
                setup_user = get_setup_user()
                password = getpass.getpass('%s password for %s: ' %
                                           (setup_user, hostname))
            password = utils.convert_to_unicode(password)
            inventory.setup_host(hostname, password)
    except CommandError as e:
        raise e
    except Exception as e:
        raise Exception(traceback.format_exc())
def __getitem__(self, idx):
    sent1, sent2, score = self.df.iloc[idx, 7:10]
    score_tensor = torch.tensor(score, dtype=torch.float32)

    tokens_1 = self.tokenizer.tokenize(convert_to_unicode(sent1))
    tokens_2 = self.tokenizer.tokenize(convert_to_unicode(sent2))
    _truncate_seq_pair(tokens_1, tokens_2, self.max_seq_len - 3)

    # assemble [CLS] + sent1 + [SEP]
    word_pieces = ["[CLS]"]
    word_pieces += tokens_1 + ["[SEP]"]
    len_1 = len(tokens_1) + 2

    # assemble [CLS] + sent1 + [SEP] + sent2 + [SEP]
    word_pieces += tokens_2 + ["[SEP]"]
    len_2 = len(tokens_2) + 1

    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    tokens_tensor = torch.LongTensor(ids)
    segments_tensor = torch.LongTensor([0] * len_1 + [1] * len_2)
    return (tokens_tensor, segments_tensor, score_tensor)
def take_action(self, parsed_args):
    try:
        hostname = parsed_args.hostname.strip()
        hostname = utils.convert_to_unicode(hostname)
        inventory = Inventory.load()
        if not inventory.get_host(hostname):
            _host_not_found(self.log, hostname)
        inventory.check_host(hostname)
    except CommandError as e:
        raise e
    except Exception as e:
        raise Exception(traceback.format_exc())
def trans_type(file, is_train):
    if is_train == 'test':
        sentences = []
        with open(file) as reader:
            data = json.load(reader)
            for entry in data:
                text = entry['text']
                sentences.append(text)
        with open(data_dir + is_train + '_sentences.txt', 'w') as writer:
            for sentence in sentences:
                writer.write(sentence + '\n')
    else:
        with open(file) as reader:
            data = json.load(reader)
            sentences = []
            intents = []
            domains = []
            role_labels = []
            for entry in data:
                text = entry['text']
                text = utils.convert_to_unicode(text)
                domain = entry['domain']
                intent = entry['intent']
                slots = entry['slots']
                sentences.append(text)
                domains.append(domain)
                intents.append(intent)
                # label the sentence using its slots
                sentence_roled = sentence_role(text, slots)
                role_labels.append(sentence_roled)
        with open(data_dir + is_train + '_sentences.txt', 'w') as writer:
            for sentence in sentences:
                writer.write(sentence + '\n')
        with open(data_dir + is_train + '_intents.txt', 'w') as writer:
            for intent in intents:
                writer.write(str(intent) + '\n')
        with open(data_dir + is_train + '_domains.txt', 'w') as writer:
            for domain in domains:
                writer.write(domain + '\n')
        with open(data_dir + is_train + '_role_labels.txt', 'w') as writer:
            for role_label in role_labels:
                writer.write(role_label + '\n')
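# Hedged sketch of the JSON input trans_type expects: a list of utterance
# records with "text", "domain", "intent" and "slots" fields (field names are
# taken from the code above; the values are illustrative only).
# [
#   {"text": "帮我查一下明天北京的天气",
#    "domain": "weather",
#    "intent": "QUERY",
#    "slots": {"city": "北京", "date": "明天"}},
#   ...
# ]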
def _create_examples(lines, set_type=None):
    """Creates examples for the training and dev sets."""
    re_ENUM = re.compile(
        r'(([-–+])?\d+(([.·])\d+)?%?|([0-9_.·]*[A-Za-z]+[0-9_.·]*)+)')
    converter = opencc.OpenCC('t2s')

    def _labels_words(p_text_segment):
        inside_tokens = []
        inside_labels = []
        for segment in p_text_segment:
            hyper_tokens = segment.split()
            segment_tokens = []
            for hyper_token in hyper_tokens:
                hyper_token = hyper_token.strip()
                if len(hyper_token) > 0:
                    is_chinese = False
                    for c in hyper_token:
                        if process.process_utils.is_cjk_char(ord(c)):
                            is_chinese = True
                            break
                    if is_chinese:
                        segment_tokens.extend(list(hyper_token))
                    else:
                        segment_tokens.append(hyper_token)
            inside_tokens.extend(segment_tokens)
            if len(segment_tokens) == 1:
                inside_labels.extend(["A"])
            elif len(segment_tokens) > 1:
                inside_labels.extend(
                    ["BS"] + ["A"] * (len(segment_tokens) - 2) + ["ES"])
        return inside_tokens, inside_labels

    for line in lines:
        line = convert_to_unicode(line.strip())
        text = str.lower(process.process_utils.strQ2B(line))
        text = converter.convert(text)
        text = re_ENUM.sub(" \\1 ", text)
        text_segment = text.split("☃")
        tokens, labels = _labels_words(text_segment)

        # recover the original (un-normalized) surface form for each token
        o_text = re.sub(r"\s|☃", "", line)
        offset = 0
        o_tokens = []
        for token in tokens:
            o_tokens.append(o_text[offset: offset + len(token)])
            offset += len(token)
        yield InputExample(guid=o_tokens, text=tokens, labels=labels)
def load_dict(dictionary_files):
    """Loads one or more vocabulary files (comma-separated) into a dictionary."""
    dictionary = collections.OrderedDict()
    dictionary_files = dictionary_files.split(",")
    for dictionary_file in dictionary_files:
        if not str.isspace(dictionary_file):
            with tf.gfile.GFile(dictionary_file, "r") as reader:
                while True:
                    token = utils.convert_to_unicode(reader.readline())
                    if not token:
                        break
                    token = token.strip().split(" ")
                    if len(token) == 2:
                        dictionary[token[0]] = token[1]
                    else:
                        dictionary[token[0]] = 1
    return dictionary
def _create_examples(lines, set_type):
    """Creates examples for the training and dev sets."""
    tf.logging.info(f"creating {set_type} examples")
    text = []
    labels = []
    for line in lines:
        o_line = line
        line = utils.convert_to_unicode(strQ2B(line.strip()))
        char_info = line.split()
        if len(char_info) != 3:
            if len(char_info) != 0:
                raise ValueError(o_line)
            # blank line: flush the sentence collected so far
            if len(text) != 0:
                yield InputExample(guid="", text=text, labels=labels)
                text = []
                labels = []
        else:
            text.append(char_info[0].strip())
            labels.append(char_info[2].strip())
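# Hedged sketch of the input the generator above expects: one character per
# line with three whitespace-separated columns (only the first and third are
# used), and a blank line terminating each sentence. Column values are
# illustrative only; a trailing blank line is needed to flush the last sentence.
#
#   美  x  B-LOC
#   国  x  I-LOC
#   (blank line)
#
# yields InputExample(guid="", text=["美", "国"], labels=["B-LOC", "I-LOC"])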
def load_vocab(vocab_files, preserve_token=None):
    """Loads one or more vocabulary files (comma-separated) into a dictionary."""
    if preserve_token is None:
        preserve_token = []
    vocab = collections.OrderedDict()
    index = 0
    for token in preserve_token:
        vocab[token] = index
        index += 1
    vocab_files = vocab_files.split(",")
    for vocab_file in vocab_files:
        with tf.gfile.GFile(vocab_file, "r") as reader:
            while True:
                token = utils.convert_to_unicode(reader.readline())
                if not token:
                    break
                token = token.strip()
                if token not in vocab:
                    vocab[token] = index
                    index += 1
    return vocab
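# Hedged usage sketch for load_vocab above: write a tiny vocabulary file and
# load it while reserving the lowest ids for special tokens. Assumes TF 1.x
# (tf.gfile) is importable, as the function itself does; the file contents and
# special-token list are illustrative only.
import tempfile

def _demo_load_vocab():
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False,
                                     encoding="utf-8") as f:
        f.write("你\n好\nworld\n")
        path = f.name
    vocab = load_vocab(path, preserve_token=["[PAD]", "[UNK]", "[CLS]", "[SEP]"])
    assert vocab["[PAD]"] == 0 and vocab["你"] == 4 and vocab["world"] == 6
    return vocab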
def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = utils.convert_to_unicode(text)
    text = self._clean_text(text)

    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    text = self._tokenize_chinese_chars(text)

    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens
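# Hedged usage sketch: the tokenize method above is the BasicTokenizer-style
# splitter used by BERT-like models (lowercase, strip accents, split
# punctuation, isolate CJK characters). The class name and the exact output
# shown below are assumptions based on that behaviour, not taken from the source.
# tokenizer = BasicTokenizer(do_lower_case=True)
# tokenizer.tokenize(u"Héllo, 世界! 123")
# -> ["hello", ",", "世", "界", "!", "123"]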