Example #1
def load_queries(queries_filename):
    """Read tab-separated queries, lemmatize them and build a vocabulary."""
    queries = {}

    vocab = Vocab()
    tokenizer = Tokenizer()
    lemmatizer = Lemmatizer()

    with open(queries_filename, 'r') as f:
        for line in f:
            # Each line: <id>\t<query text>[\t<synonym text>]
            parts = line.rstrip('\n').split('\t')
            q = Query()

            q.id = int(parts[0])
            q_text = parts[1]
            q_syn_text = parts[2] if len(parts) > 2 else ''

            q.text = q_text + ' ' + q_syn_text

            q.tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_text))
            q.synonim_tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_syn_text))
            queries[q.id] = q

    # Build the vocabulary from query phrases, unigrams and (inverted/gapped) bigrams.
    for q in queries.values():
        tokens = q.tokens + q.synonim_tokens

        vocab.add_phrase(tuple(q.tokens))

        for tkn in tokens:
            vocab.add1(tkn)

        grams, inv_grams, gap_grams = get_ngrams(tokens, 2, inverted=True, with_gap=True)
        for g in grams + inv_grams + gap_grams:
            vocab.add2(g)

    return queries, vocab
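
The Tokenizer, Lemmatizer, Vocab, Query and get_ngrams used above come from the surrounding project and are not shown on this page. Below is a minimal, hypothetical sketch of the fit_transform contract the example relies on; the whitespace splitting and pass-through lemmatization are illustrative assumptions, not the project's actual behaviour.

# Hypothetical stand-ins illustrating the fit_transform(text) -> list[str]
# contract assumed by load_queries(); not the project's real classes.
class WhitespaceTokenizer:
    def __init__(self, stop_words=None):
        self.stop_words = set(stop_words or [])

    def fit_transform(self, text):
        # Lowercase, split on whitespace, drop stop words.
        return [t for t in text.lower().split() if t not in self.stop_words]


class IdentityLemmatizer:
    def fit_transform(self, tokens):
        # A real lemmatizer would map each token to its base form;
        # here the tokens are passed through unchanged.
        return list(tokens)


tokens = IdentityLemmatizer().fit_transform(
    WhitespaceTokenizer().fit_transform("buy train tickets online"))
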
Example #2
    def lemmatize(self, stop_words=None):
        """Tokenize and lemmatize every stored query."""
        tokenizer = Tokenizer(stop_words=stop_words)
        lemmatizer = Lemmatizer(stop_words=stop_words)

        self.lemmatized_queries = {}
        for q_id in self.queries.dict:
            q = self.queries.get(q_id)

            tok_q = tokenizer.fit_transform(q)
            lem_q = lemmatizer.fit_transform(tok_q)
            self.lemmatized_queries[int(q_id)] = lem_q
Example #3
        removeStopWords=args["removeStopWords"],
        doSpellingCorrection=args["doSpellingCorrection"],
        removeNewLine=args["removeNewLine"],
        removePunctuation=args["removePunctuation"],
        removeHtmlTags=args["removeHtmlTags"],
        minTextLength=args["minTextLength"])
    predict_df["processed"] = preprocessor.fit_transform(
        predict_df["text_german"])
    predict_df = predict_df.dropna(subset=["processed"], axis=0)

    print("Tokenize")
    tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                          ngram=preperation_ngram,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

    ## for testing purposes
    #train_df = train_df.sample(100)
    #val_df = val_df.sample(20)
    #test_df = test_df.sample(20)

    ## apply the model
    labels = [
        "price_pos", "price_neg", "quality_pos", "quality_neg",
        "restaurant_pos", "restaurant_neg", "food_pos", "food_neg",
        "drinks_pos", "drinks_neg", "ambience_pos", "ambience_neg",
        "service_pos", "service_neg"
    ]
    sentimentDict = {
        "pos": "positiv",
Example #4
        val_df.to_pickle(val_pre_path)
        test_df.to_pickle(test_pre_path)
    else:
        train_df = pd.read_pickle(train_pre_path)
        val_df = pd.read_pickle(val_pre_path)
        test_df = pd.read_pickle(test_pre_path)
        ## get data and train columns
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0],
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_df[data_column] = tokenizer.fit_transform(train_df[data_column])
        val_df[data_column] = tokenizer.transform(val_df[data_column])
        test_df[data_column] = tokenizer.transform(test_df[data_column])

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_tok_path)
        val_df.to_pickle(val_tok_path)
        test_df.to_pickle(test_tok_path)

    else:
        train_df = pd.read_pickle(train_tok_path)
        val_df = pd.read_pickle(val_tok_path)
        test_df = pd.read_pickle(test_tok_path)
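
In Example #4 the tokenizer is fitted on the training split only (fit_transform) and then applied to the validation and test splits with transform, so nothing is learned from held-out data. A minimal sketch of the same pattern with scikit-learn's CountVectorizer; the vectorizer and the toy texts are assumptions for illustration, not the Tokenizer used above.

from sklearn.feature_extraction.text import CountVectorizer

train_texts = ["good food", "bad service"]       # toy data for illustration
val_texts = ["good service"]

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)  # vocabulary learned on train only
X_val = vectorizer.transform(val_texts)          # reused; unseen words are ignored
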
Example #5
        np.save(train_pre_path.format("data"), train_data, allow_pickle=True)
        np.save(val_pre_path.format("data"), val_data, allow_pickle=True)
        np.save(test_pre_path.format("data"), test_data, allow_pickle=True)
        np.save(train_pre_path.format("target"),
                train_target,
                allow_pickle=True)
        np.save(val_pre_path.format("target"), val_target, allow_pickle=True)
        np.save(test_pre_path.format("target"), test_target, allow_pickle=True)

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(args=tokenizer_model,
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_data = tokenizer.fit_transform(train_data)
        val_data = tokenizer.transform(val_data)
        test_data = tokenizer.transform(test_data)

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        if sparse.issparse(train_data):
            sparse.save_npz(train_tok_path.format("data"), train_data)
        else:
            np.save(train_tok_path.format("data"), train_data)
        np.save(train_tok_path.format("target"), train_target)
        if sparse.issparse(val_data):
            sparse.save_npz(val_tok_path.format("data"), val_data)
        else:
            np.save(val_tok_path.format("data"), val_data)
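
Example #5 branches on sparse.issparse because SciPy sparse matrices should be written with sparse.save_npz (and read back with sparse.load_npz), while dense NumPy arrays go through np.save. A self-contained sketch of that dispatch; the helper name, file names and toy data are illustrative assumptions.

import numpy as np
from scipy import sparse

def save_features(path_without_ext, data):
    # Sparse matrices use SciPy's .npz format, dense arrays NumPy's .npy format.
    if sparse.issparse(data):
        sparse.save_npz(path_without_ext + ".npz", data)
    else:
        np.save(path_without_ext + ".npy", data)

save_features("train_data", sparse.random(10, 5, density=0.2, format="csr"))
save_features("train_target", np.zeros(10))
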