import joblib
import numpy as np

# preprocess and get_features are project helpers defined elsewhere in this repo.
def predict(message):
    # Load the serialized classifier from disk.
    filename = 'finalized_model.sav'
    loaded_model = joblib.load(filename)

    # Pre-process the raw message and turn it into a dense feature matrix.
    message = preprocess(message)
    y = get_features(message)
    y = y.toarray()

    # Index of the last feature column in the vectorized message.
    tup = y.shape
    k = tup[1] - 1

    # Trim trailing columns until the feature count matches what the trained
    # SVM/AdaBoost models expect; very short texts do not yield enough features.
    if k > 56:
        while k != 56:
            y = np.delete(y, k, 1)
            k -= 1
    else:
        return "Text too short to obtain features, therefore not spam"

    result_array = loaded_model.predict(y)
    if result_array[-1] == 0:
        return "Not Spam"
    else:
        return "Spam"
Example #2
def main():
    # GCN, Config, depart, and evaluate are assumed to be defined elsewhere in the project.
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from pre_process import preprocess

    # Load node features, the pre-computed adjacency matrix a_hat, and node labels.
    feature, a_hat, labels = preprocess()
    print("loaded")

    # Split node indices into a training set and a held-out set.
    selected, unselected = depart(len(labels), 1 - Config.test_ratio)
    labels_selected = labels[selected]
    labels_unselected = labels[unselected]

    # Move tensors to the GPU and build the GCN.
    feature = torch.from_numpy(feature).float().cuda()
    tensor_selected = torch.tensor(labels_selected).long().cuda()
    a_hat = torch.tensor(a_hat).float().cuda()
    net = GCN(a_hat, feature.shape[1], Config.num_classes, Config.hidden_size,
              Config.n_hidden_layer).cuda()

    print(net)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=Config.lr)
    net.train()
    for e in range(Config.num_epochs):
        optimizer.zero_grad()
        output = net(feature)
        loss = criterion(output[selected], tensor_selected)
        loss.backward()
        optimizer.step()

        trained_accuracy = evaluate(output[selected], labels_selected)
        untrained_accuracy = evaluate(output[unselected], labels_unselected)
        print(
            "[Epoch %d]: trained acc: %.7f, untrained acc: %.7f, loss: %.7f" %
            (e, trained_accuracy, untrained_accuracy,
             loss.detach().cpu().numpy()))
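The loop relies on an evaluate helper defined elsewhere in the project; below is a minimal sketch of what such an accuracy check could look like, assuming it compares argmax predictions against integer labels (the signature and body are assumptions, not the project's actual code):

import numpy as np

def evaluate(output, labels):
    # Assumed helper: fraction of nodes whose predicted class (argmax over the
    # output logits) matches the ground-truth label.
    preds = output.detach().cpu().numpy().argmax(axis=1)
    return float((preds == np.asarray(labels)).mean())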
Example #3
    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = preprocess(text, True)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
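Example #4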
        "dims": [1, 1, 128, 128],
        "data_type": "TYPE_FP32"
    }, {
        "name": "bbox_wh",
        "dims": [1, 2, 128, 128],
        "data_type": "TYPE_FP32"
    }, {
        "name": "center_shift",
        "dims": [1, 2, 128, 128],
        "data_type": "TYPE_FP32"
    }]

    # Build a per-image preprocessing callable for INT8 calibration; a TensorFlow 1.x
    # session runs the project's preprocess graph on each calibration image.
    from pre_process import preprocess
    import tensorflow as tf
    sess = tf.Session()
    preprocess_fn = lambda x: sess.run(preprocess([x]))[0]

    stream = trt_backend.ImageBatchStream("./calibrator_files", 5,
                                          preprocess_fn)
    int8_calibrator = trt_backend.IInt8EntropyCalibrator2(inputs_def, stream)

    trt_backend.torch2trt(computation_graph=model,
                          graph_name="detection-network",
                          model_file="./network/dla34.pth",
                          inputs_def=inputs_def,
                          outputs_def=outputs_def,
                          instances=16,
                          gpus=[0, 1, 2, 3],
                          version=1,
                          export_path="../../model_repository",
                          int8_calibrator=int8_calibrator)
Example #5
import json
import string

from nltk.corpus import stopwords

# preprocess (the project's Tweet tokenizer) is defined elsewhere in this repo.
def generate_term_list(filename, term_filter):
    '''
    Generate a list of all the terms that appear in a .json file

    **Parameters**

        filename: *str*
            The name of the .json file to be input.
        term_filter: *str*
            The type of filter to be used for parsing through all the words in
            the Tweets. There are six different filters that can be used:
                default - considers all of the terms
                remove_stop_words - does not consider stop-words
                hashtags - only considers hashtags and no other terms
                terms_only - does not consider hashtags or mentions
                single_terms - only counts terms once in a Tweet
                single_stop_words - only counts terms once and does not
                                    consider stop-words

    **Returns**

        terms: *list*
            A list with one sub-list per Tweet, each containing the terms that
            appear in that Tweet after filtering.
    '''
    # Define list of common characters used for punctuation
    punctuation = list(string.punctuation)
    # Define list of stop-words, which are common words that do not carry
    # significance (conjunctions, adverbs, etc.)
    stop = stopwords.words('english') + punctuation + \
        ["rt", "via", "…", "’", "“", "”", "‘", "1",
            "2", "3", "4", "5", "6", "7", "8", "9", "0"]
    terms = []
    # Open the file
    with open(filename, 'r') as f:
        # Parse through each line/Tweet in the .json file
        for line in f:
            tweet = json.loads(line)
            # Pre-process the information in the Tweet
            ppterms = preprocess(tweet['text'])
            # Apply the appropriate filter as specified by the user
            if term_filter == "remove_stop_words":
                terms.append([term for term in ppterms if term not in stop])
            elif term_filter == "hashtags":
                terms.append(
                    [term for term in ppterms if term.startswith('#')])
            elif term_filter == "terms_only":
                terms.append([
                    term for term in ppterms
                    if term not in stop and not term.startswith(('#', '@'))
                ])
            elif term_filter == "single_terms":
                temp = [term for term in ppterms]
                terms.append(list(set(temp)))
            elif term_filter == "single_stop_words":
                temp = [term for term in ppterms if term not in stop]
                terms.append(list(set(temp)))
            elif term_filter == "default" or term_filter is None:
                terms.append([term for term in ppterms])
            else:
                raise Exception("Invalid filter type.")
    return terms
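A hedged usage sketch (tweets.json is an illustrative filename for a file with one Tweet JSON object per line):

# Collect only the hashtags from each Tweet; the result has one sub-list per Tweet.
hashtag_lists = generate_term_list("tweets.json", "hashtags")
print(hashtag_lists[:3])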
Example #6
    def OnStart(self, event):    # Launch the main program
        pp.preprocess()
        frame = Result(parent=None, id=-1)
        frame.Show()
Example #7
parser.add_argument("-p2",
                    "--population2",
                    type=int,
                    help="population of the GA for the full-route")
parser.add_argument("-m2",
                    "--mutationrate2",
                    type=int,
                    help="mutation rate of the GA for the full-route")
parser.add_argument("-g2",
                    "--generation2",
                    type=int,
                    help="generation to run for the GA for the full-route")

args = parser.parse_args()

nodearray = preprocess(args.filename)

# Defaults for the first GA stage.
p1 = 30
m1 = 0.1
g1 = 20

n = int(np.sqrt(len(nodearray) / 2.72015))

# Defaults for the full-route GA.
p2 = 200
m2 = 0.5
g2 = 20

if args.population1:
    p1 = args.population1
if args.mutationrate1:
    m1 = args.mutationrate1