def test_pad_sequences_vector():
    a = [[[1, 1]],
         [[2, 1], [2, 2]],
         [[3, 1], [3, 2], [3, 3]]]

    # test padding
    b = sequence.pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[[0, 0], [0, 0], [1, 1]],
                        [[0, 0], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
    b = sequence.pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[[1, 1], [0, 0], [0, 0]],
                        [[2, 1], [2, 2], [0, 0]],
                        [[3, 1], [3, 2], [3, 3]]])

    # test truncating
    b = sequence.pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 2], [3, 3]]])

    b = sequence.pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 1], [3, 2]]])

    # test value
    b = sequence.pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[[1, 1], [1, 1], [1, 1]],
                        [[1, 1], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
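For quick reference, a minimal standalone sketch of the same API on plain integer sequences (assuming only that keras_preprocessing is installed; expected output shown in the comments):

from keras_preprocessing import sequence

# Ragged integer sequences, as a tokenizer would produce them.
seqs = [[1], [2, 3], [4, 5, 6]]

# Default behaviour: left-pad ('pre') with 0 up to the longest sequence.
print(sequence.pad_sequences(seqs))
# [[0 0 1]
#  [0 2 3]
#  [4 5 6]]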
Example #2
    def get_train_test_data(self, split_factor=0.8):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.headlines)
        train_sequences = tokenizer.texts_to_sequences(self.headlines)
        self.word_list = tokenizer.word_index
        train_padded = pad_sequences(train_sequences, padding='post')

        label_tokenizer = Tokenizer()
        label_tokenizer.fit_on_texts(self.category)
        labels = [label[0] for label in label_tokenizer.texts_to_sequences(self.category)]
        self.label_len = len(label_tokenizer.word_index) + 1
        training_label_seq = np.array(labels)

        split_index = int(len(train_padded) * split_factor)
        X_train = train_padded[0:split_index]
        Y_train = training_label_seq[0:split_index]
        X_test = train_padded[split_index:]
        Y_test = training_label_seq[split_index:]

        return X_train, Y_train, X_test, Y_test
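As a side note, a toy sketch of the label-encoding step used above; Tokenizer indices start at 1, which is why label_len is len(word_index) + 1 (illustrative data and import path, not the project's own):

from keras_preprocessing.text import Tokenizer

categories = ["sports", "politics", "sports", "tech"]
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(categories)
labels = [seq[0] for seq in label_tokenizer.texts_to_sequences(categories)]
print(label_tokenizer.word_index)  # e.g. {'sports': 1, 'politics': 2, 'tech': 3}
print(labels)                      # [1, 2, 1, 3]
Example #3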
    def get_train_instances(self, train):
        users, checkins, cand_venues, labels = [], [], [], []

        for u in self.trainSeq:
            visited = self.trainSeq[u]
            checkin_ = []
            for v in visited[:-1]:
                checkin_.append(v)
                checkins.extend(sequence.pad_sequences([checkin_[:]], maxlen=self.maxVenue))

            # start from the second venue in user's checkin sequence.
            visited = visited[1:]
            for i in range(len(visited)):
                cand_venues.append(visited[i])
                users.append(u)
                labels.append(1)
                j = np.random.randint(self.uNum)
                # resample j while (u, j) is in the training set or j was already visited before step i
                while (u, j) in train or j in visited[:i]:
                    j = np.random.randint(self.uNum)

                cand_venues.append(j)
                users.append(u)
                labels.append(0)

        sess_number = np.ones(len(labels))

        users = np.array(users)
        items = np.array(cand_venues)
        sess_item = np.array(checkins)
        labels = np.array(labels)

        feature_dict = {'user': users, 'item': items, 'score': labels, 'sess_0_item': sess_item}

        fixlen_feature_names = get_fixlen_feature_names(self.feature_columns)
        varlen_feature_names = get_varlen_feature_names(self.feature_columns)
        x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in
                                                                     varlen_feature_names]
        x += [sess_number]

        return x, labels
Example #4
def train():
    # zero padding
    input_vec = sequence.pad_sequences(np.load('dataset/input_vector.npy'))
    target_vec = np.load('dataset/target_vector.npy')

    input_dim = input_vec.shape[2]
    output_dim = target_vec.shape[1]
    input_sequence_length = input_vec.shape[1]

    # hyper params
    num_epochs = 1
    batch_size = 512
    units = 128

    # get model
    model = get_model(input_sequence_length, input_dim, output_dim, units)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(input_vec, target_vec, batch_size=batch_size, epochs=num_epochs)

    # save weights file
    weights_dir = 'model_weights/'
    if not os.path.isdir(weights_dir):
        os.mkdir(weights_dir)
    weights_file = '%sepochs_%s' % (num_epochs,
                                    time.strftime("%Y%m%d_%H_%M.h5"))
    weights_path = '%s%s' % (weights_dir, weights_file)
    model.save_weights(weights_path)

    # save model
    json_string = model.to_json()
    model_dir = 'model_json/'
    if not os.path.isdir(model_dir):
        os.mkdir(model_dir)
    model_file = '%sepochs_%s' % (num_epochs,
                                  time.strftime("%Y%m%d_%H_%M.json"))
    model_path = '%s%s' % (model_dir, model_file)
    with open(model_path, 'w') as f:
        f.write(json_string)

    print("Done!")
Example #5
 def get_batch_sample(self):
     times = []
     click_times_list = []
     product_ids = []
     product_categorys = []
     advertiser_ids = []
     industrys = []
     creative_ids = []
     ages = []
     for time, click_times, product_id, \
         product_category, advertiser_id, \
         industry, creative_id, age, gender in self.get_sample(filename=self.filename):
         times.append(time)
         click_times_list.append(
             [ele if ele < 96 else 0 for ele in click_times])
         product_ids.append(product_id)
         product_categorys.append(product_category)
         advertiser_ids.append(advertiser_id)
         industrys.append(industry)
         creative_ids.append(creative_id)
         if self.is_age:
             ages.append(age)
         else:
             ages.append(gender)
         if len(times) >= self.batch_size:
             maxlen = min([max([len(ele) for ele in times]), 50])
             yield np.array([
                 pad_sequences(times, maxlen=maxlen),
                 pad_sequences(click_times_list, maxlen=maxlen),
                 pad_sequences(product_ids, maxlen=maxlen),
                 pad_sequences(product_categorys, maxlen=maxlen),
                 pad_sequences(advertiser_ids, maxlen=maxlen),
                 pad_sequences(industrys, maxlen=maxlen),
                 pad_sequences(creative_ids, maxlen=maxlen),
             ]), np.array(ages)
             times = []
             click_times_list = []
             product_ids = []
             product_categorys = []
             advertiser_ids = []
             industrys = []
             creative_ids = []
             ages = []
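Example #6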
def tokenize_words(data: np.ndarray) -> np.ndarray:
    """[summary]
    tokenize the words, and add additional features
    Arguments:
        data {[numpy array]} -- the data
    
    Returns:
        [numpy array] -- the tokenized data, with additional features
    """
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                          lower=True)
    lines = data[:, 6]
    tokenizer.fit_on_texts(lines)
    x = tokenizer.texts_to_sequences(lines)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)

    additional_2 = np.array(
        [features.create_features(data[i])[0] for i in range(len(data))])
    x = np.hstack((x, additional_2))
    return x
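For context, a self-contained sketch of the Tokenizer -> texts_to_sequences -> pad_sequences pipeline this function builds on (the constants and import paths here are illustrative, not the module's own):

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

texts = ["the cat sat on the mat", "the dog barked"]

tokenizer = Tokenizer(num_words=1000, lower=True)
tokenizer.fit_on_texts(texts)               # build the word index
seqs = tokenizer.texts_to_sequences(texts)  # words -> integer ids
padded = pad_sequences(seqs, maxlen=8)      # left-pad with 0 to length 8
print(padded.shape)                         # (2, 8)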
Example #7
File: views.py Project: seokju2ng/relay_09
def sentiment_predict(new_sentence):
    file_name = os.path.join(os.path.dirname(__file__), 'best_model.h5')
    pickle_name = os.path.join(os.path.dirname(__file__), 'tokenizer.pickle')
    loaded_model = load_model(file_name)
    max_len = 30
    okt = Okt()
    with open(pickle_name, 'rb') as handle:
        tokenizer = pickle.load(handle)

    stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
    new_sentence = okt.morphs(new_sentence, stem=True)  # tokenize
    new_sentence = [word for word in new_sentence if word not in stopwords]  # remove stopwords
    encoded = tokenizer.texts_to_sequences([new_sentence])  # integer-encode
    pad_new = pad_sequences(encoded, maxlen=max_len)  # pad
    score = float(loaded_model.predict(pad_new))  # predict
    if score > 0.5:
        return (score * 100), 'positive'
        # print("Positive review with {:.2f}% probability.\n".format(score * 100))
    else:
        return (1 - score) * 100, 'negative'
Example #8
def tokenize_roberta_sent(X1: list, X2: list):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    sentences = [
        "<s> " + X1[i] + " </s></s> " + X2[i] + " </s>" for i in range(len(X1))
    ]
    tokenized_text = [tokenizer.tokenize(sentence) for sentence in sentences]
    X = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_text]
    #sent_emb = [[0 if i<sentence.index(102) else 1 for i  in range(len(sentence)) ] for sentence in X]
    MAX_LEN = max([len(x) for x in X]) + 1
    # Pad sequences to make them all equally long
    X = pad_sequences(X, MAX_LEN, 'long', 'post', 'post', 1)
    #sent_emb = pad_sequences(sent_emb,MAX_LEN,'long','post','post',1)
    # Find the locations of each entity and store them
    entity_locs1 = np.asarray(
        [[i for i, s in enumerate(sent) if '<' in s and len(s) == 2]
         for sent in tokenized_text])
    entity_locs2 = np.asarray(
        [[i for i, s in enumerate(sent) if '^' in s and len(s) == 2]
         for sent in tokenized_text])

    return X, np.concatenate((entity_locs1, entity_locs2), 1)
Example #9
def get_score(tweet):
    tweet = [tweet]
    reconstructed_model = keras.models.load_model(
        "../politicalClassifier/lstm_model")

    with open('../politicalClassifier/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    sequences_test = tokenizer.texts_to_sequences(tweet)
    sequences_matrix_test = sequence.pad_sequences(sequences_test, maxlen=100)

    prediction = reconstructed_model.predict(sequences_matrix_test)[0][0]
    return prediction


# test_dem_tweet = 'On September 9 — just last week — President Trump unveiled his shortlist of candidates to tap for the Supreme Court should a vacancy open up under his presidency. With Ruth Bader Ginsburg’s death Friday night, he now has that opportunity. The next Trump Supreme Court pick may well come from a list of 20 names he revealed last week, plus a longer list he had already released. The Trump White House and his allies in the Senate have spent years preparing for the next Supreme Court vacancy. Indeed, the judicial selection process may be the one professional and highly competent operation in this administration. Trump has filled the bench with fairly young, impressively credentialed ideologues who will reliably cast very conservative votes if appointed to the Supreme Court, and his Supreme Court shortlist reflects that work. Half of the names that Trump just announced are people he previously appointed to a lower court, and several more are individuals he’s appointed to non-judicial roles. And it doesn’t actually matter all that much which specific name Trump chooses from his list — or whether he ultimately decides to go off-list. Though Trump has kept his promise to only name Supreme Court justices from a prereleased list, he frequently adds new names to it. Neither of Trump’s Supreme Court appointees, Neil Gorsuch and Brett Kavanaugh, was on the original list Trump first released in 2016, but were added in subsequent iterations. What all the names on the list have in common — both old and new — is that they were vetted by Trump’s team (and often by the conservative Federalist Society) to ensure that they are reliable conservatives. That said, there is one important divide among the names on Trump’s list. Some, such as former Solicitor General Paul Clement or Fourth Circuit Judge Allison Jones Rushing, are solid conservatives who aren’t known for over-the-top, Trumpy rhetoric. Others, such as Sens. Ted Cruz (R-TX), Josh Hawley (R-MO), and Tom Cotton (R-AR) are politicians who spent their time in Congress flaunting their conservative bona fides and enraging Democrats. Still others, such as Fifth Circuit Judges James Ho and Kyle Duncan, are sitting judges who take the same trolly approach as Cruz, Hawley, and Cotton, but do so from the bench. The biggest mystery, in other words, is not what the next potential Trump nominee to the Supreme Court might believe, it’s whether Trump would pick someone with a professional demeanor — or choose a professional troll. Who is on Trump’s list? The 20-name list Trump released last week augments an existing list of 25 names that he has released gradually. Most of the names on both lists possess many of the elite credentials one would expect to find in a Supreme Court nominee. Close to half of the individuals on the new list clerked on the Supreme Court shortly after graduating from law school. And, though the lists include a few politicians like the three senators mentioned above and Kentucky Attorney General Daniel Cameron, both lists are dominated by sitting judges — including many Trump appointees. Judge Amy Coney Barrett, of the Seventh Circuit, is a prominent Trump appointee on both lists. She was favored by religious conservatives for Trump’s previous Court pick, perceived as potentially more likely to allow restrictions on abortions than Brett Kavanaugh. Because so many Trump appointees make the list, many of these judges have not served long enough to develop substantial records on the bench. 
But several of the names on Trump’s new list will raise deep concerns among Democrats. Judge James Ho has spent his not even three years on the United States Court of Appeals for the Fifth Circuit writing opinions that read like something published by Breitbart. His very first judicial opinion was a sweeping attack on campaign finance laws — and it included an entirely gratuitous swipe at the Affordable Care Act. Ho argued that “if you don’t like big money in politics, then you should oppose big government in our lives,” and he cited the Supreme Court’s decision largely upholding Obamacare to drive home his point. Ho has also railed against the “ moral tragedy of abortion ” in an opinion where he accused a fellow federal judge of retaliating “against people of faith for not only believing in the sanctity of life—but also for wanting to do something about it.” Ho’s Fifth Circuit colleague Kyle Duncan, meanwhile, spent much of his pre-judicial career litigating against LGBTQ rights and the right to vote. As a judge, he’s best known for an opinion where he spent more than 10 pages explaining why he insists on referring to a transgender woman using masculine pronouns. Ninth Circuit Judge Lawrence VanDyke is a particularly surprising addition to Trump’s list because VanDyke’s nomination to the federal bench was panned by the American Bar Association due to concerns that VanDyke is too lazy to do the job. “Mr. VanDyke’s accomplishments are offset by the assessments of interviewees that Mr. VanDyke is arrogant, lazy, an ideologue, and lacking in knowledge of the day-to-day practice including procedural rules,” the ABA explained in a scathing letter deeming him unqualified for the federal bench. The ABA’s investigation found that VanDyke “lacks humility, has an ‘entitlement’ temperament, does not have an open mind, and does not always have a commitment to being candid and truthful.” It’s unclear why Trump loyalists would want to see someone appointed to the Supreme Court who may lack the temperament and the work ethic to do the job well. That said, VanDyke is an outlier on Trump’s list. For the most part, the nearly four dozen names Trump has suggested as possible Supreme Court nominees are diligent and highly talented lawyers. They just also happen to be lawyers who are eager to move the law sharply to the right. The White House’s judicial selection process is the most professional operation in the Trump administrationn. To his many critics, “Donald Trump” is a name practically synonymous with goonish incompetence. But Trump’s judicial selection operation is nothing like that. It is both efficient and highly effective in identifying reliable conservative ideologues with sterling legal resumes. In less than four years as president, Trump has appointed 201 lawyers to lifetime appointments on the federal bench, including 53 to powerful seats on the United States Courts of Appeal. By contrast, President Obama appointed only 55 appellate judges during his eight years as president. One reason for this disparity is that Senate Republicans, led by McConnell, imposed a near-total blockade on appeals court confirmations during Obama’s final two years in the White House. That meant that Trump has effectively been able to fill all the appeals court vacancies that arose during his presidency, plus nearly all the vacancies that should have been filled in Obama’s last two years in office. Trump’s judges, moreover, are quite young. 
“The average age of circuit judges appointed by President Trump is less than 50 years old,” the Trump White House bragged in November of 2019, “a full 10 years younger than the average age of President Obama’s circuit nominees.” And a large percentage of them have amassed impressive credentials such as Supreme Court clerkships and other government jobs of great influence. All of this is a reason for liberals to be more afraid of Trump’s judges — and potential justices — than if Trump were picking undistinguished hacks to fill the bench. Judges of great ability are far more likely to find innovative ways to reshape the law than incompetents and mediocrities. Moreover, Trump is filling the bench with some of the Federalist Society’s brightest minds at the very moment when the judiciary is gaining power relative to the other branches. As I wrote several months ago in a piece laying out Trump’s impact on the bench: In an age of legislative dysfunction, whoever controls the courts controls the country. In the past decade or so — or more precisely, since Republicans took over the House in 2011 — Congress has been barely functional. You can count on one hand — and possibly on just a few fingers — the major legislation it has enacted. Judges, by contrast, have become the most consequential policymakers in the nation. They have gutted America’s campaign finance law and dismantled much of the Voting Rights Act. They have allowed states to deny health coverage to millions of Americans. They’ve held that religion can be wielded as a sword to cut away the rights of others. They’ve drastically watered down the federal ban on sexual harassment. And that barely scratches the surface. If Trump gets to replace a liberal justice, this practice of judicial policymaking will only accelerate. Environmental regulations are likely to be dismantled en masse. Voting rights will be hollowed out even more. Obamacare could be struck down. And, perhaps most significantly, purely partisan Republican arguments will gain even more purchase in the Supreme Court. Anyone Trump names to the Supreme Court, if Trump’s allowed to do so, is likely to push the law relentlessly to the right. Help keep Vox free for all Millions turn to Vox each month to understand what’s happening in the news, from the coronavirus crisis to a racial reckoning to what is, quite possibly, the most consequential presidential election of our lifetimes. Our mission has never been more vital than it is in this moment: to empower you through understanding. But our distinctive brand of explanatory journalism takes resources — particularly during a pandemic and an economic downturn. Even when the economy and the news advertising market recovers, your support will be a critical part of sustaining our resource-intensive work, and helping everyone make sense of an increasingly chaotic world. Contribute today from as little as $3.'
# test_rep_tweet = 'President Donald Trump stepped up his rhetoric Thursday on cultural issues, aiming to boost enthusiasm among rural Wisconsin voters as he tries to repeat his path to victory four years ago. Making his fifth visit to the pivotal battleground state this year, Trump views success in the state’s less-populated counties as critical to another term. He held a rally Thursday evening in Mosinee, in central Wisconsin, an area of the state that shifted dramatically toward Republicans in 2016, enabling Trump to overcome even greater deficits in urban and suburban parts of the state. Trump has increasingly used his public appearances to elevate cultural issues important to his generally whiter and older base, as he hinges his campaign on turning out his core supporters rather than focusing on winning over a narrow slice of undecided voters. In Mosinee, he called for a statute to ban burning the American flag in protest — a freedom protected by the Supreme Court — and criticized sports players and leagues for allowing demonstrations against racial inequality. “We have enough politics, right," he said, joking that sometimes, “I can’t watch me.” He added of protests in sports, “People don’t want to see it and the ratings are down.”'
# test_dem2_tweet = '"When he nominated her in 1993, Bill Clinton called her “the Thurgood Marshall of gender-equality law”, comparing her advocacy and lower-court rulings in pursuit of equal rights for women to the work of the great jurist who advanced the cause of equal rights for Black people. Ginsburg persuaded the supreme court that the 14th amendment’s guarantee of equal protection applied not only to racial discrimination but to sex discrimination as well. For Ginsburg, principle was everything – not only equal rights, but also the integrity of democracy. Always concerned about the consequences of her actions for the system as a whole, she advised young people “to fight for the things you care about but do it in a way that will lead others to join you”."'
# test_label = 0

#print(get_score(test_dem2_tweet))
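Example #10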
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return array(X1), array(X2), array(y)
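To make the sequence-splitting loop above concrete, here is a toy walk-through with hypothetical token ids (pre-padding is the pad_sequences default):

from keras_preprocessing.sequence import pad_sequences

seq = [2, 5, 9, 3]   # hypothetical token ids for one caption
max_length = 5
for i in range(1, len(seq)):
    in_seq, out_seq = seq[:i], seq[i]
    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
    print(in_seq, '->', out_seq)
# [0 0 0 0 2] -> 5
# [0 0 0 2 5] -> 9
# [0 0 2 5 9] -> 3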
Example #11
File: tps.py Project: amnir/ltrt
def predict():
    our_tweets_vects = []
    tweet = request.json["tweet"]
    tweet = clean_tweet(tweet).split(' ')
    vector = [dict.token2id[word] for word in tweet if word in dict.token2id]
    our_tweets_vects.append(vector)
    our_tweets_np = np.array(our_tweets_vects)
    our_tweets_vects = sequence.pad_sequences(our_tweets_np, maxlen=150)

    with graph.as_default():
        our = model.predict_classes(our_tweets_vects)

    if len(our) != 1:
        raise Exception("Failed to predict")

    ans = {}
    if our[0] == 0:
        ans = {"affiliation": "left"}
    elif our[0] == 1:
        ans = {"affiliation": "right"}
    return json.dumps(ans)
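Example #12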
 def process_X(self,
               data,
               word2idx,
               max_sentence_length,
               data_as_sentences=False):
     if data_as_sentences:
         X = [[
             word2idx[w] if w in word2idx.keys() else word2idx['UNK']
             for w in s
         ] for s in data]
     else:
         sentences = SentenceGetter(data, label_adapter=get_label).sentences
         X = [[
             word2idx[w[0]] if w[0] in word2idx.keys() else word2idx['UNK']
             for w in s
         ] for s in sentences]
     X = pad_sequences(maxlen=max_sentence_length,
                       sequences=X,
                       padding="post",
                       value=word2idx["PAD"])
     return np.array(X)
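Example #13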
def bag_of_words(in_array, tokenizer):
    """
    Converts turns an array of reviews into a bag of words model, each word is represented by an integer
    :param in_array: Array of all of the reviews needed to be converted in to the bag of words model
    :param tokenizer:  A object to convert each word to an integer
    :return: a list of all the reviews padded to the correct length to be used for input into the model
    """
    word_list = []
    for word in in_array:
        word_list.append(word)
    tokenizer.fit_on_texts(word_list)
    tokened_reviews = []
    for review in in_array:
        sequ = tokenizer.texts_to_sequences(review)
        filtered_sequ = [x for x in sequ if x != []]
        to_pad = []
        for l in filtered_sequ:
            to_pad.append(l[0])
        tokened_reviews.append(pad_sequences([to_pad], PADDED_LENGTH))
    tokened_reviews_np = np.vstack(tokened_reviews)
    return tokened_reviews_np
Example #14
 def transform(self,x=None):
     vocab_list = list(self.wordModel.wv.vocab.keys())
     word_index = {word: index for index, word in enumerate(vocab_list)}
     seq = []
     all_text_seq = []
     num = 1
     for line in self.datas:
         words = line.split(" ")
         for index in range(len(words) - 1):
             if num < self.sequence_length:
                 if words[index] in word_index and words[index] != words[index + 1]:
                     seq.append(word_index[words[index]] + 1)
                     num += 1
                 else:
                     pass
         all_text_seq.append(seq)
         seq = []
         num = 1
     all_text_test = pad_sequences(all_text_seq, maxlen=self.sequence_length, padding='post')
     # print(all_text_test)
     return all_text_test
Example #15
    def generate_test_data(self, df, isValidate=True):
        sessions, items, seq, labels, itemIds, positions, prices = [], [], [], [], [], [], []
        for city, rows in df.groupby("city"):
            for idx, row in rows.iterrows():
                impressions = [self.item_index[int(i)] for i in row['impressions'].split("|")]
                price = [int(i) for i in row['prices'].split("|")]
                interactions = [self.item_index[int(i)] for i in row['interactions'].split("|")] if type(
                    row['interactions']) == str else []

                if isValidate:
                    try:
                        gtItem = self.item_index[int(row['reference'])]
                        tmp = np.zeros(len(impressions))
                        tmp[impressions.index(gtItem)] = 1
                    except Exception as e:
                        continue
                    labels.extend(tmp)
                    sessions.extend([idx] * len(impressions))
                else:
                    sessions.extend([row['session_id']] * len(impressions))
                    itemIds.extend(row['impressions'].split("|"))
                items.extend(impressions)
                # positions.extend([i + 1 for i in range(len(impressions))])
                # prices.extend(price)
                seq.extend([interactions for i in range(len(impressions))])

        items = np.array(items)
        items = items.reshape(len(items), 1)
        seq = pad_sequences(seq, maxlen=self.maxlen)
        sessions = np.array(sessions)
        labels = np.array(labels)
        # prices = np.array(prices).reshape(len(items), 1)
        # positions = np.array(positions).reshape(len(items), 1)

        # if isValidate:
        #     return sessions, items, seq, positions, prices, labels
        # return sessions, itemIds, items, seq, positions, prices
        if isValidate:
            return sessions, [items, seq], labels
        return sessions, itemIds, [items, seq]
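Example #16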
def build_vectors(train_data_x, train_data_y, labels2idx, tokenisation):

    low = tokenisation['low']
    simple = tokenisation['simple']
    stop = tokenisation['stop']

    token2idx, max_sent_len, _ = build_token_index(train_data_x,
                                                   lowercase=low,
                                                   simple=simple,
                                                   remove_stopwords=stop)

    print("token2idx: ", len(token2idx))

    # y_data: encode into one-hot vectors with all labels in the hierarchy
    train_y = []
    for y_sample in train_data_y:
        all_labels = np.zeros(len(labels2idx))
        for labels in y_sample:
            for level, label in labels.items():
                all_labels[labels2idx[label]] = 1
        train_y.append(all_labels)

    # vectorize, i.e. tokens to indexes and pad
    print("Vectorizing input data\n")
    vectors = []
    for x in train_data_x:
        text = x['title'] + " SEP " + x['body']
        tokens = tokenise(text,
                          lowercase=low,
                          simple=simple,
                          remove_stopwords=stop)
        vector = vectorizer(tokens, token2idx)
        vectors.append(vector)
    vectors_padded = pad_sequences(vectors,
                                   padding='post',
                                   maxlen=max_sent_len,
                                   truncating='post',
                                   value=token2idx['PADDED'])

    return vectors_padded, np.array(train_y), token2idx, max_sent_len
Example #17
def tokenize(tokenizer, sentences, next_sentences, MAX_LEN):

    input_ids = []
    attention_masks = []

    print("encoding...")
    for i, (sent, next_sent) in enumerate(zip(sentences, next_sentences)):
        if i % 1000 == 0: print("encoding " + str(i) + "th sentence")
        _encoded = tokenizer.encode_plus(
            sent,
            text_pair=next_sent,
            add_special_tokens=True,
            max_length=MAX_LEN,
            #  pad_to_max_length=True,
            return_tensors='pt',
        )
        #  pdb.set_trace()
        #  input_ids = _encoded['input_ids'][0].numpy()
        encoded_sent = _encoded['input_ids'][0]
        #  attention_mask = _encoded['special_tokens_mask']

        #  encoded_sent -> ['special_tokens_mask'], ['input_ids'], ['token_type_ids']
        input_ids.append(encoded_sent)
        #  attention_masks.append(attention_mask)

    input_ids = pad_sequences(input_ids,
                              maxlen=MAX_LEN,
                              dtype="long",
                              value=0,
                              truncating="post",
                              padding="post")
    #  attention_mask = _encoded['special_tokens_mask']

    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)

    #  pdb.set_trace()

    return input_ids, attention_masks
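For reference, the attention-mask rule used above simply flags non-padding positions; a tiny illustration with made-up token ids:

padded = [101, 2023, 2003, 102, 0, 0]
print([int(token_id > 0) for token_id in padded])  # [1, 1, 1, 1, 0, 0]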
Example #18
    def train_word2vec_model(self,
                             sentences_with_spilt_words: Iterable[
                                 Iterable[str]],
                             sentence_len: int = 5) -> Word2Vec:
        self._embedding_model: Word2Vec = Word2Vec(sentences_with_spilt_words,
                                                   min_count=self.MIN_COUNT,
                                                   size=self.EMBEDDING_SIZE,
                                                   window=10)

        if os.path.exists(self.EMBEDDING_MODEL_SAVED_PATH):
            os.remove(self.EMBEDDING_MODEL_SAVED_PATH)

        self._embedding_model.save(self.EMBEDDING_MODEL_SAVED_PATH)

        self.tokenizer: Tokenizer = Tokenizer(
            num_words=10000,
            oov_token="NaN",
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
        self.tokenizer.fit_on_texts(sentences_with_spilt_words)

        sentences_with_split_words_sequence = self.tokenizer.texts_to_sequences(
            sentences_with_spilt_words)
        self.sentences_with_split_words_sequence = pad_sequences(
            sentences_with_split_words_sequence,
            maxlen=sentence_len,
            padding="post",
            truncating="post")

        self.word_index = self.tokenizer.word_index

        self.embedding_matrix = np.zeros(
            (len(self.word_index) + 1, self.EMBEDDING_SIZE))

        for word, index in self.word_index.items():
            try:
                self.embedding_matrix[index] = self._embedding_model[word]
            except Exception:
                continue

        return self._embedding_model
Example #19
def gen_dataset(dataset_path, k_folds, treat_F_as_deceptive):
    nltk.download('wordnet')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    generator = RawData()
    processor = ReviewsPreprocessor(text, stop_words, wordnet)
    vocab = Tokenizer(num_words=Constants.MAX_FEATURES)

    df = generator.generate(dataset_path, treat_F_as_deceptive)
    reviews = list(processor.process(df[Constants.Cols.REVIEW]))
    vocab.fit_on_texts(reviews)
    encoded_reviews = vocab.texts_to_sequences(reviews)
    df[Constants.Cols.REVIEW] = list(
        pad_sequences(encoded_reviews, maxlen=Constants.MAX_LEN))

    k_fold = StratifiedKFold(n_splits=k_folds,
                             shuffle=True,
                             random_state=Constants.SEED)

    for train_idx, test_idx in k_fold.split(df, df[Constants.Cols.LABEL]):
        yield Dataset(df.iloc[train_idx], df.iloc[test_idx], vocab,
                      treat_F_as_deceptive)
Example #20
def create_inference_dataset(sentences: Series, tokenizer: MyTokenizer):

    sentences_list = []
    length_samples = len(sentences)

    for i in range(length_samples):
        sentence = sentences.iloc[i]
        tokenized_sentence = []
        for word in sentence.split(sep=' '):
            if word != '':
                token = tokenizer.word_to_index(word)
                token = tokenizer.word_to_index(
                    'UNK') if token is None else token
                tokenized_sentence.append(token)
        sentences_list.append(tokenized_sentence)

    dataset = pad_sequences(sequences=sentences_list,
                            maxlen=MAX_WORD_SENTENCE,
                            padding='post',
                            value=tokenizer.word_to_index('PAD'))

    return dataset
Example #21
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenlize text.

    Vectorize a text corpus by transform each text in texts to a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence, padding if shorter, trim is longer.
        x_train: List contains text data.

    Returns:
        x_train: Tokenlized input data.
        word_index: Dictionary contains word with tokenlized index.
    """
    print("tokenlizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data readed and convert to %d length sequences" % max_seq_length)
    return x_train, word_index
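A minimal usage sketch of the function above (toy inputs; assumes the Tokenizer and pad_sequences imports this snippet relies on are in scope):

toy_texts = ["hello world", "hello keras", "padding makes lengths equal"]
x, word_index = tokenlize_text(max_num_words=50, max_seq_length=6, x_train=toy_texts)
print(x.shape)          # (3, 6)
print(len(word_index))  # 7 distinct words seen in toy_texts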
Example #22
def vectorize_dev_data(dev_data_x, max_sent_len, token2idx, tokenisation):
    print("Vectorizing dev data\n")
    vectors = []
    low = tokenisation['low']
    simple = tokenisation['simple']
    stop = tokenisation['stop']

    for x in dev_data_x:
        text = x['title'] + " SEP " + x['body']
        tokens = tokenise(text,
                          lowercase=low,
                          simple=simple,
                          remove_stopwords=stop)
        vector = vectorizer(tokens, token2idx)
        vectors.append(vector)

    test_vectors = pad_sequences(vectors,
                                 padding='post',
                                 maxlen=max_sent_len,
                                 truncating='post',
                                 value=token2idx['PADDED'])
    return test_vectors
Example #23
def prueba_1():
	docs = ['Well done!',
			'Good work',
			'Great effort',
			'nice work',
			'Excellent!',
			'Weak',
			'Poor effort!',
			'not good',
			'poor work',
			'Could have done better.']
	# define class labels
	labels = np.array([1,1,1,1,1,0,0,0,0,0])

	# integer encode the documents
	vocab_size = 50
	encoded_docs = [one_hot(d, vocab_size) for d in docs]
	print(encoded_docs)

	# pad documents to a max length of 4 words
	max_length = 4
	padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
	print(padded_docs)

	# define the model
	model = Sequential()
	model.add(Embedding(vocab_size, 8, input_length=max_length))
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid'))
	# compile the model
	model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
	# summarize the model
	print(model.summary())

	# fit the model
	model.fit(padded_docs, labels, epochs=50, verbose=0)
	# evaluate the model
	loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
	print('Accuracy: %f' % (accuracy*100))
Example #24
File: pre.py Project: cjcjagt/malware_api
def shuffleData(datas, labels, tokenflag=False):
    all_text_seq = []
    array = []
    if (tokenflag):
        sentences = MySentences('data/')  # a memory-friendly iterator
        model = gensim.models.Word2Vec(sentences, min_count=0)
        model.save('myWord2VecModel.model')

    gensim_model = gensim.models.Word2Vec.load('myWord2VecModel.model')
    print(len(gensim_model.wv.vocab))
    # for word in gensim_model.wv.vocab:
    # 	print(word,gensim_model[word])
    # print(datas)
    for line in datas:
        # array = [gensim_model[word] for word in line.split(" ") if word in gensim_model]
        # Keep the full sequence of word vectors for this line; a plain list has no
        # .mean() method, and pad_sequences below expects one sequence per sample.
        all_text_seq.append([gensim_model[word] for word in line.split(" ") if word in gensim_model])
    print(len(all_text_seq[0][0]))
    # print(type(line))
    # print(line)
    # for word in line.split(" "):
    # print(word)
    all_text_test = pad_sequences(all_text_seq, maxlen=sequence_length, padding='post', value=0, dtype='float32')  # keep float embeddings (default dtype is int32)

    label_one_hot = np.zeros((len(labels), class_num), dtype=int)
    for i in range(len(labels)):
        label_one_hot[i][labels[i]] = 1
    np.random.seed(100)
    del all_text_seq
    shuffle_indices = np.random.permutation(np.arange(len(labels)))
    x_shuffled = np.array(all_text_test)[shuffle_indices.astype(int)]
    y_shuffled = np.array(labels)[shuffle_indices.astype(int)]

    # Split train/test set
    dev_sample_index = -1 * int(dev_sample_percentage * len(labels))
    x_train, x_val = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_val = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del datas, labels, x_shuffled, y_shuffled, all_text_test
    return x_train, x_val, y_train, y_val
Example #25
def create_dataset(sentences: Series, targets: Series, tokenizer: MyTokenizer):

    assert len(sentences) == len(
        targets
    ), 'Error - create_dataset - sentence and target series have different length'

    length_samples = len(sentences)

    x_sentence, y_sentence = [], []

    for i in range(length_samples):
        sentence = sentences.iloc[i]
        target = targets.iloc[i]

        x_sentence.append([
            tokenizer.word_to_index(word) for word in sentence.split(sep=' ')
            if word != ''
        ])
        y_sentence.append(
            [tokenizer.label_to_index(t) for t in str(target).split(sep=' ')])

    x_dataset = pad_sequences(sequences=x_sentence,
                              maxlen=MAX_WORD_SENTENCE,
                              padding='post',
                              value=tokenizer.word_to_index('PAD'))

    num_classes = tokenizer.n_labels
    y_dataset = []
    for targets in y_sentence:
        cat_target = np.zeros(num_classes)
        for target in targets:
            cat_target += to_categorical(target, num_classes=num_classes)

        y_dataset.append(cat_target)

    y_dataset = np.array(y_dataset)

    return x_dataset, y_dataset
Example #26
def viewGradCam(model):
    image = img_to_array(
        load_img("./images/CXR2484_IM-1012-1001.png",
                 color_mode="rgb",
                 target_size=(256, 256))) / 255.

    image3 = img_to_array(
        load_img("./images/CXR2484_IM-1012-1001.png",
                 color_mode="rgb",
                 target_size=(256, 256))) / 255.

    image2 = np.expand_dims(image, axis=0)

    report = "COMPARISON None.  INDICATION Dizzy. Unable to XXXX.  FINDINGS Frontal and lateral views of the chest with overlying external cardiac monitor leads show normal size and configuration of the cardiac silhouette. There are scattered nodular opacities, XXXX calcified granulomas. No XXXX focal airspace consolidation or pleural effusion.  IMPRESSION No acute or active cardiac, pulmonary or pleural disease. Probable previous granulomatous disease. "

    print("Load tokenizer")
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    x1_train = tokenizer.texts_to_sequences([report])
    x1_train = pad_sequences(x1_train, maxlen=MAX_WORDS_TEXT, padding='post')

    # layer = <choose the layer you want to visualize>

    predicted = model.predict([image2, x1_train])
    print(predicted)
    predicted[predicted >= 0.5] = 1
    predicted[predicted < 0.5] = 0
    print(predicted)
    predicted = np.array(predicted)
    indexes = np.where(predicted[0] == 1)[0]
    print(indexes)
    plt.imshow(image3)
    plt.show()
    for index in indexes:
        grads = visualize_cam(model, layer, index, image2)
        plt.imshow(image3)
        plt.imshow(grads, cmap='jet', alpha=0.4)
        plt.show()
Example #27
    def update_sentence_length(self, domain_config, data_dict):
        """
        Pad or trim all sentences to the correct length. In the future
        this number may be evolved and included in global_hyperparameters.

        :param domain_config: The configuration dictionary for domain evaluation
        :param data_dict: the dictionary containing domain keys for each data
                    set used.
        :return: Nothing
        """

        if data_dict is None:
            # We have no tokens to pad.
            # This is the NetworkVisualizer's code path.
            return

        tokens = data_dict['tokens']
        info = domain_config.get('info', {})
        max_sentence_length = info.get("max_sentence_length")
        for split in tokens:
            tokens[split] = pad_sequences(tokens[split],
                                          maxlen=max_sentence_length,
                                          value=0)
Example #28
def tokenize_dataset(x, y):

    tokenizer = CustomTokenizer()
    tokenizer.fit(x, y)

    length_samples = len(x)

    x_sentence, y_sentence = [], []

    for i in range(length_samples):
        sentence = x[i]

        x_sentence.append([
            tokenizer.word_to_index(word) for word in sentence.split()
            if word != ''
        ])

    x_dataset = pad_sequences(sequences=x_sentence,
                              maxlen=MAX_WORD_SENTENCE,
                              padding='post',
                              value=tokenizer.word_to_index('PAD'))

    return x_dataset, y, tokenizer
Example #29
def process_result(response, model):
    titles = list(map(lambda row: row[5], response))
    x = tokenizer.texts_to_sequences(titles)
    x = sequence.pad_sequences(x,
                               maxlen=length,
                               truncating='post',
                               padding='pre')
    tensor = torch.tensor(x, dtype=torch.long)
    x = model.forward(tensor)
    logits_list = x.tolist()
    softmax_list = F.softmax(x, dim=1).tolist()
    classes = torch.max(x, dim=1)[1].tolist()
    return [{
        'foundBy': row[0],
        'entity': row[2] if row[1] is None else row[1],
        'stockLabel': row[4] if row[3] is None else row[3],
        'title': row[5],
        'text': row[6],
        'class': class_,
        'softmax': softmax,
        'logits': logits,
    } for row, logits, class_, softmax in zip(response, logits_list, classes,
                                              softmax_list)]
Example #30
def prep_data(image_data, caption_data, cap_max_len):
    n = len(image_data)
    print("Number of images: %d" % n)
    assert n == len(
        caption_data), "Number of images <--> captions should be equal."

    images, captions, target_caps = [], [], []
    print("Max caption length: %d" % cap_max_len)
    for img, txt in zip(image_data, caption_data):
        for i in range(1, len(txt)):
            in_txt = txt[:i]
            out_txt = txt[i]
            in_txt = torch.from_numpy(
                pad_sequences([in_txt], maxlen=cap_max_len).flatten())
            images.append(img.unsqueeze(0))
            captions.append(in_txt.unsqueeze(0))
            target_caps.append(torch.LongTensor([out_txt]))

    images = torch.cat(images)
    captions = torch.cat(captions).long()
    target_caps = torch.cat(target_caps)
    # print(images.dtype, captions.dtype, target_caps)
    return [images, captions, target_caps]
Example #31
def tokenize_plus_attention(df, MAX_LEN=150):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in df]

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    # Pad our input tokens
    input_ids = pad_sequences(input_ids,
                              maxlen=MAX_LEN,
                              dtype="long",
                              truncating="post",
                              padding="post")

    # Create attention masks
    attention_masks = []

    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    return [input_ids, attention_masks]
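A hedged usage sketch of the function above (assumes the transformers package is installed and the pretrained vocabulary can be downloaded; only shapes are checked):

sentences = ["BERT pads every input to a fixed length.",
             "The mask marks which positions hold real tokens."]
input_ids, attention_masks = tokenize_plus_attention(sentences, MAX_LEN=16)
print(input_ids.shape)         # (2, 16)
print(attention_masks[0][:4])  # 1.0 for real tokens, 0.0 for padding positions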
Example #32
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenlize text.

    Vectorize a text corpus by transform each text in texts to a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence, padding if shorter, trim is longer.
        x_train: List contains text data.

    Returns:
        x_train: Tokenlized input data.
        word_index: Dictionary contains word with tokenlized index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer
    print("tokenlizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data readed and convert to %d length sequences" % max_seq_length)
    return x_train, word_index
def test_pad_sequences_str():
    a = [['1'], ['1', '2'], ['1', '2', '3']]

    # test padding
    b = sequence.pad_sequences(a, maxlen=3, padding='pre', value='pad', dtype=object)
    assert_equal(b, [['pad', 'pad', '1'], ['pad', '1', '2'], ['1', '2', '3']])
    b = sequence.pad_sequences(a, maxlen=3, padding='post', value='pad', dtype='<U3')
    assert_equal(b, [['1', 'pad', 'pad'], ['1', '2', 'pad'], ['1', '2', '3']])

    # test truncating
    b = sequence.pad_sequences(a, maxlen=2, truncating='pre', value='pad',
                               dtype=object)
    assert_equal(b, [['pad', '1'], ['1', '2'], ['2', '3']])
    b = sequence.pad_sequences(a, maxlen=2, truncating='post', value='pad',
                               dtype='<U3')
    assert_equal(b, [['pad', '1'], ['1', '2'], ['1', '2']])

    with pytest.raises(ValueError, match="`dtype` int32 is not compatible with "):
        sequence.pad_sequences(a, maxlen=2, truncating='post', value='pad')
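A small sketch of the dtype/value interaction that the last assertion exercises (behaviour as implemented in the keras_preprocessing version under test):

a = [['a'], ['a', 'b']]
# An object (or string) dtype can hold the string padding value.
print(sequence.pad_sequences(a, maxlen=2, value='pad', dtype=object))
# The default int32 dtype cannot, so pad_sequences raises ValueError.
try:
    sequence.pad_sequences(a, maxlen=2, value='pad')
except ValueError as err:
    print(err)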