Example #1
def _csv_iterator(data_path, tokenizer, ngrams, yield_cls=False):
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
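This iterator follows the pattern from torchtext's legacy text-classification datasets, where the yielded token streams feed vocabulary construction. A minimal usage sketch, assuming the legacy torchtext API and a placeholder train.csv path:

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Build a vocabulary from unigrams and bigrams of the training split
# ("train.csv" is a placeholder path; ngrams=2 is an arbitrary choice).
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(_csv_iterator("train.csv", tokenizer, 2))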
Example #2
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = csv.reader(f)
        for row in reader:
            tokens = " ".join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example #3
def _imdb_iterator(key, extracted_files, tokenizer, ngrams, yield_cls=False):
    for fname in extracted_files:
        if 'urls' in fname:
            continue
        elif key in fname and ('pos' in fname or 'neg' in fname):
            with io.open(fname, encoding="utf8") as f:
                label = 1 if 'pos' in fname else 0
                if yield_cls:
                    yield label, ngrams_iterator(tokenizer(f.read()), ngrams)
                else:
                    yield ngrams_iterator(tokenizer(f.read()), ngrams)
Example #4
def _csv_iterator(data_path, ngrams=1, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    toxic = pd.read_csv(data_path)

    for i in range(len(toxic.tweet)):

        tokens = toxic.tweet[i]
        tokens = tokenizer(tokens)
        if yield_cls:
            # NOTE: the original snippet referenced an undefined `row`; assuming the
            # label lives in the dataframe's first column.
            yield int(toxic.iloc[i, 0]) - 1, ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
Example #5
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = row[1]
            tokens = tokenizer(tokens)
            if yield_cls:
                yield 1 if int(row[4]) == 3 else 0, ngrams_iterator(
                    tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example #6
def _csv_iterator(data_path, ngrams, dataset_name=None, yield_cls=False):
    # tokenizer = get_tokenizer("basic_english")
    tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter='\t')
        for row in reader:
            tokens = row[1]
            tokens = tokenizer(tokens)
            if yield_cls:
                label = int(LABELS[dataset_name][row[0]]) - 1
                yield label, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
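The LABELS lookup used above is assumed to live at module level and map a dataset name to a label-string -> id table; a hypothetical example of its shape (the dataset name and labels are placeholders):

# Hypothetical LABELS table: raw label strings map to 1-based ids, which the iterator shifts to 0-based.
LABELS = {
    "my_dataset": {"negative": "1", "positive": "2"},
}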
Example #7
def _csv_iterator(data_path, ngrams, yield_cls=False, label=-1):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            tokens = row[5]
            tokens = tokenizer(tokens)

            if yield_cls:
                yield row[7], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example #8
def csv_iterator(data_path, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            yield ngrams_iterator(tokenizer(tokens), ngrams)
Example #9
def _compute_ngram_counter(tokens, max_n):
    """ Create a Counter with a count of unique n-grams in the tokens list

    Arguments:
        tokens: a list of tokens (typically a string split on whitespace)
        max_n: the maximum order of n-gram wanted

    Outputs:
        output: a collections.Counter object with the unique n-grams and their
            associated count

    Examples:
        >>> from torchtext.data.metrics import _compute_ngram_counter
        >>> tokens = ['me', 'me', 'you']
        >>> _compute_ngram_counter(tokens, 2)
            Counter({('me',): 2,
             ('you',): 1,
             ('me', 'me'): 1,
             ('me', 'you'): 1,
             ('me', 'me', 'you'): 1})
    """
    assert max_n > 0
    ngrams_counter = collections.Counter(
        tuple(x.split(' ')) for x in ngrams_iterator(tokens, max_n))

    return ngrams_counter
Example #10
def text_to_tensor(text, vocab, ngrams):
    # Assumes a module-level `tokenizer` (e.g. get_tokenizer("basic_english")).
    tokens = ngrams_iterator(tokenizer(text), ngrams=ngrams)
    token_ids = list(
        filter(lambda x: x is not Vocab.UNK,
               [vocab[token] for token in tokens]))
    tokens = torch.tensor(token_ids)
    return tokens
Example #11
def classify(text):
    print('predicting [' + text + ']')

    # normalize input string
    text = re.sub(r'([\u4e00-\u9fff])', r' \1', text)
    l = list(utils.ngrams_iterator(_basic_english_normalize(text), 2))
    l = [[stoi.get(token, 0) for token in l]]

    text = torch.tensor(l)

    with torch.no_grad():
        result = model(text, None)

        # sort results by score
        value, index = (torch.sort(result, descending=True))
        classIdx = []
        scores = []

        print("===========================================")
        # just pick the 3 most relevant classes
        for i in range(0, 3):
            idx = index[0][i].item() + 1
            score = value[0][i].item()
            print("class:{}, score:{}".format(class_idx_to_name[idx], score))
            classIdx.append(idx)
            scores.append(int(score))
        return classIdx, scores
Example #12
    def preprocess(self, data):
        """
        Normalizes the input text for the PyTorch model using the following basic cleanup operations:
            - remove HTML tags
            - lowercase all text
            - expand contractions (e.g. I'd -> I would, don't -> do not)
            - remove accented characters
            - remove punctuation
        Converts the normalized text to a tensor using the source_vocab.
        Returns a Tensor.
        """

        line = data[0]
        text = line.get("data") or line.get("body")
        if isinstance(text, (bytes, bytearray)):
            text = text.decode('utf-8')

        text = self._remove_html_tags(text)
        text = text.lower()
        text = self._expand_contractions(text)
        text = self._remove_accented_characters(text)
        text = self._remove_punctuation(text)
        text = self._tokenize(text)
        text = torch.as_tensor([
            self.source_vocab[token]
            for token in ngrams_iterator(text, self.ngrams)
        ],
                               device=self.device)
        return text
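The cleanup helpers called above (_remove_html_tags, _expand_contractions, _remove_accented_characters, _remove_punctuation, _tokenize) are not shown in this snippet. A minimal sketch of plausible implementations, assuming regex/unicodedata-based cleanup and the basic_english tokenizer (this is not the handler's actual code):

import re
import string
import unicodedata

from torchtext.data.utils import get_tokenizer


class PreprocessHelpersSketch:
    """Hypothetical implementations of the helpers used by preprocess()."""

    _CONTRACTIONS = {"i'd": "i would", "don't": "do not", "can't": "cannot"}

    def _remove_html_tags(self, text):
        # Drop anything that looks like an HTML tag.
        return re.sub(r"<[^>]+>", " ", text)

    def _expand_contractions(self, text):
        # Replace known contractions with their expanded forms.
        for short, expanded in self._CONTRACTIONS.items():
            text = re.sub(re.escape(short), expanded, text, flags=re.IGNORECASE)
        return text

    def _remove_accented_characters(self, text):
        # Decompose accented characters and strip the combining marks.
        return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

    def _remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def _tokenize(self, text):
        return get_tokenizer("basic_english")(text)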
Example #13
    def preprocess(self, data):
        """
        Normalizes the input text for the PyTorch model using the following basic cleanup operations:
            - remove HTML tags
            - lowercase all text
            - expand contractions (e.g. I'd -> I would, don't -> do not)
            - remove accented characters
            - remove punctuation
        Converts the normalized text to a tensor using the source_vocab.
        Returns a Tensor.
        """
        ngrams = 2

        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")
        # The body may arrive as raw bytes or as an already-decoded string.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode('utf-8')

        text = self._remove_html_tags(text)
        text = text.lower()
        text = self._expand_contractions(text)
        text = self._remove_accented_characters(text)
        text = self._remove_punctuation(text)
        text = self._tokenize(text)
        text = torch.tensor([
            self.source_vocab[token]
            for token in ngrams_iterator(text, ngrams)
        ])

        return text
Example #14
    def __getitem__(self, i):
        raw_datum: Example = super().__getitem__(i)
        tokens = raw_datum.text
        ngrams = list(ngrams_iterator(tokens, self.ngrams))
        text = self.fields["text"].numericalize([ngrams]).squeeze()
        label = int(self.fields["label"].numericalize([raw_datum.label]))
        return label, text
Example #15
def testing_predict(model, vocabulary, ngrams):
    tokenizer = get_tokenizer("basic_english")
    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
    ex_text_str = """
        MEMPHIS, Tenn. – Four days ago, Jon Rahm was enduring the
        season’s worst weather conditions on Sunday at The Open on
        his way to a closing 75 at Royal Portrush, which considering
        the wind and the rain was a respectable showing. Thursday’s
        first round at the WGC-FedEx St. Jude Invitational was another
        story. With temperatures in the mid-80s and hardly any wind,
        the Spaniard was 13 strokes better in a flawless round. Thanks
        to his best putting performance on the PGA Tour, Rahm finished
        with an 8-under 62 for a three-stroke lead, which was even more
        impressive considering he’d never played the front nine at TPC
        Southwind.
        """
    with torch.no_grad():
        text = torch.tensor([
            vocabulary[token]
            for token in ngrams_iterator(tokenizer(ex_text_str), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        print("\nTesting the prediction of sample text:")
        print(ex_text_str)
        print("This is a %s news" % ag_news_label[output.argmax(1).item() + 1])
Example #16
def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1
Example #17
def run(raw_data):
    global model, dictionary, tokenizer, ngrams, device

    prev_time = time.time()
    post = json.loads(raw_data)
    incoming_text = post['text']

    with torch.no_grad():
        text = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(tokenizer(incoming_text), ngrams)
        ])

        output = model(text.to(device), torch.tensor([0]).to(device))

        current_time = time.time()
        inference_time = datetime.timedelta(seconds=current_time - prev_time)

        payload = {
            'time': str(inference_time.total_seconds()),
            'text': incoming_text,
            'scores': output[0].tolist(),
            'rating': output.argmax(1).item() + 1
        }

        print('Input ({}), Prediction ({})'.format(text, payload))
        return payload
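A hypothetical invocation of the handler above, assuming the module-level model, dictionary, tokenizer, ngrams, and device have already been initialized:

import json

# The request body is a JSON document with a "text" field.
payload = run(json.dumps({"text": "the soup was cold and the service was slow"}))
print(payload["rating"], payload["scores"])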
Example #18
def transform_data(a_dataset, unique_vocab_dict, classifier_name, NGRAMS):
    if classifier_name == 'LR':
        train_X = torch.zeros(len(a_dataset), len(unique_vocab_dict))
        for i in tqdm(range(len(a_dataset))):
            tokens = nltk.word_tokenize(a_dataset[i][0])
            tokens = tokens if NGRAMS == 1 else ngrams_iterator(tokens, NGRAMS)
            for j in tokens:
                # Skip punctuation tokens and a handful of stopwords.
                if j in string.punctuation or j in (
                        'to', 'and', 'the', 'be', 'a', 'is', 'that', 'of'):
                    continue
                try:
                    train_X[i][unique_vocab_dict[j]] += 0.5
                except KeyError:
                    # Token is not in the vocabulary; ignore it.
                    pass
        train_Y = torch.Tensor([i[1] for i in a_dataset]).long()
    else:
        train_X = convert_texts_to_ids([x[0] for x in a_dataset],
                                       unique_vocab_dict,
                                       max_seq_length=17,
                                       do_lower_case=False,
                                       sos=False,
                                       eos=False)
        train_X = torch.Tensor(train_X)
        train_Y = torch.Tensor([x[1] for x in a_dataset]).long()
    # Labels = train_Y
    # train_Y = torch.zeros(len(a_dataset), 2)
    # for i in Labels:
    #     train_Y[i] = torch.tensor(np.eye(2)[i])
    return train_X, train_Y
Example #19
def _pd_iterator(data_to_parse: np.ndarray,
                 ngrams: int,
                 yield_cls: bool = False):
    """
    :param data_to_parse: array with two columns: label and text
    :param ngrams: maximum n-gram order
    :param yield_cls: whether to yield the label together with the tokens
    :return: generator of token n-grams (optionally paired with labels) for downstream torchtext parsing
    """
    tokenizer = get_tokenizer(None)
    for row_id in range(len(data_to_parse)):
        tokens = data_to_parse[row_id][1]
        tokens = tokenizer(tokens)
        if yield_cls:
            yield data_to_parse[row_id][0], ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
Example #20
def predict(text):
    tokenizer = get_tokenizer("basic_english")
    vocab = train_dataset.get_vocab()
    with torch.no_grad():
        text = torch.tensor([
            vocab[token] for token in ngrams_iterator(tokenizer(text), NGRAMS)
        ])
        output = model(text, torch.tensor([0]))
        return "Relaxing" if output.argmax(1).item() == 1 else "Not Relaxing"
Example #21
def ToEmbed(text, model):
    tokenizer = get_tokenizer("basic_english")
    tokenized_text = tokenizer(text)

    origin_tensor = torch.tensor(
        [model.vocab[token] for token in ngrams_iterator(tokenized_text, 1)])
    origin_tensor = torch.stack([origin_tensor], 0)
    origin_tensor = model.embed(origin_tensor)

    return origin_tensor
Example #22
def build_vocab(xlist, NGRAMS, min_count):
    vocabi2w = ['[SOS]', '[EOS]', '[PAD]', '[UNK]']  # A list of unique words
    seen = collections.defaultdict(int)
    for i in tqdm(range(len(xlist))):
        tokens = nltk.word_tokenize(xlist[i][0])
        tokens = tokens if NGRAMS == 1 else ngrams_iterator(tokens, NGRAMS)
        for token in tokens:
            seen[token] += 1
    vocabi2w += [x for x in seen if seen[x] >= min_count]
    vocabw2i = {vocabi2w[x]: x for x in range(len(vocabi2w))}
    return vocabw2i, vocabi2w
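A hypothetical call to build_vocab, assuming the same module scope (nltk imported, punkt tokenizer data available) and toy data; xlist is a list of (text, label) pairs:

xlist = [("the cat sat on the mat", 0), ("dogs are great", 1)]  # toy data
vocabw2i, vocabi2w = build_vocab(xlist, NGRAMS=2, min_count=1)
# Map the tokens of the first text to ids, falling back to [UNK] for unseen tokens.
ids = [vocabw2i.get(tok, vocabw2i['[UNK]']) for tok in nltk.word_tokenize(xlist[0][0])]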
Example #23
def _data_iterator(data_rows, ngrams, yield_cls=False):
    """[summary]

    Args:
        data_rows ([type]): [description]
        ngrams ([type]): [description]
        yield_cls (bool, optional): [description]. Defaults to False.

    Yields:
        [type]: [description]
    """
    tokenizer = get_tokenizer("basic_english")

    for row in data_rows:
        tokens = ' '.join(row[1:])
        tokens = tokenizer(tokens)
        if yield_cls:
            yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
Example #24
    def predict(self, text):
        tokenizer = get_tokenizer("basic_english")
        with torch.no_grad():
            text = torch.tensor([
                self.vocab[token]
                for token in ngrams_iterator(tokenizer(text), self.ngrams)
            ])
            output = self.model(text, torch.tensor([0]))
            result = output.argmax(1).item()
            label = self.labels[result + 1]
            return label
Example #25
def csv_iterator(data_path, ngrams, yield_cls=False):
    """
    加载csv文本文件,并根据原始文本 生成 指定ngram语法的 词汇(token)样本
    Args:
        data_path:
        ngrams:
        yield_cls:

    Returns:

    """
    # The original snippet relied on a module-level `tokenizer`; a basic_english tokenizer is assumed here.
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example #26
def predict(text, model, vocab, ngrams):
    # tokenizer = get_tokenizer("basic_english")
    tokenizer = text.split()
    print('tokenizer', tokenizer, '\n')
    with torch.no_grad():
        text = torch.tensor(
            [vocab[token] for token in ngrams_iterator(tokenizer, ngrams)])
        #print("word_set: ", word_set, '\n')     # 테스트 위해 사용
        #print("text: ", text, "\n")     # 테스트 위해 사용
        output = model(text, torch.tensor([0]))
        print("output: ", output, '\n')
        return output.argmax(1).item() + 1
Example #27
def predict_review_sentiment(text):
    # Convert text to tensor
    text = torch.tensor(
        [VOCAB[token] for token in ngrams_iterator(TOKENIZER(text), NGRAMS)]
    )

    # Compute output
    # TODO compute the output of the model. Note that you will have to give it a 0 as an offset.
    output = ...
    confidences = torch.softmax(output, dim=1)
    return confidences.squeeze()[
        1
    ].item()  # Class 1 corresponds to confidence of positive
Example #28
def _csv_iterator(data_path,
                  ngrams,
                  skip_header=True,
                  yield_cls=False,
                  label_col=6,
                  token_col=[1, 5],
                  label_mapping={
                      "simulation": 0,
                      "hardware": 1,
                      "edge_computing": 2
                  }):
    tokenizer = get_tokenizer("spacy", "en_core_web_sm")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        if skip_header:
            next(reader, None)
        for row in reader:
            tokens = ' '.join([j for i, j in enumerate(row) if i in token_col])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield label_mapping[row[label_col]], ngrams_iterator(
                    tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example #29
def predict_fn(sentence, model_dict):
    logger.info('predict_fn: Predicting for {}.'.format(sentence))

    model = model_dict['model']
    dictionary = model_dict['dictionary']

    with torch.no_grad():
        sentence_tensor = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(_tokenizer(sentence), _ngrams)
        ])
        output = model(sentence_tensor, torch.tensor([0]))
        label = output.argmax(1).item() + 1
        logger.info('predict_fn: Prediction result is {}.'.format(label))
        return label
Example #30
def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        # Multi-label output: return the indices of all classes above THRESHOLD.
        ret = output > THRESHOLD
        print(ret)
        result = [i for i, r in enumerate(ret[0]) if r]
        return result