Example #1
def create_corpus_and_labels():
    negative_corpus = _load_corpus(filename=full_path(DATA_SET["negative"]))
    positive_corpus = _load_corpus(filename=full_path(DATA_SET["positive"]))
    negatives_number = len(negative_corpus)
    positives_number = len(positive_corpus)

    corpus = []
    labels = []
    # interleave negative and positive documents while both classes last
    for i in range(min(negatives_number, positives_number)):
        corpus.append(negative_corpus[i])
        labels.append([0])
        corpus.append(positive_corpus[i])
        labels.append([1])

    # append the surplus documents of the larger class; labels stay nested in
    # single-element lists, matching the loop above
    remainder = negative_corpus if negatives_number > positives_number else positive_corpus
    label = 0 if negatives_number > positives_number else 1
    remainder = remainder[min(negatives_number, positives_number):]

    labels += [[label] for _ in range(abs(positives_number - negatives_number))]
    corpus += remainder

    if PRINT_STATS:
        print(
            "average document length %d, negatives: %d, positives %d, total %d"
            % get_corpus_stats(corpus, negatives_number, positives_number))
        print(corpus[randrange(len(corpus))])
    return corpus, labels
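A minimal usage sketch (assuming the corpus loaders and configuration referenced above are available in the surrounding module):

corpus, labels = create_corpus_and_labels()
# negative and positive documents alternate while both classes last, then the surplus class follows
assert len(corpus) == len(labels)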
Example #2
def imdb_preprocess():
    base_directory = full_path("data/imdb/")

    pos_subdirectories = ["test/pos", "train/pos"]
    neg_subdirectories = ["test/neg", "train/neg"]

    neg_file_path = full_path("data/imdb.neg")
    pos_file_path = full_path("data/imdb.pos")

    create_corpus_file(base_directory, neg_file_path, neg_subdirectories)
    create_corpus_file(base_directory, pos_file_path, pos_subdirectories)
Example #3
def remove_existing_tenders_from_fetch_list(bulletin_nums_to_fetch: dict):
    jsons_dir_path = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'])
    for json_dir_name in os.listdir(jsons_dir_path):
        json_subdir_path = os.path.join(jsons_dir_path, json_dir_name)
        for json_tender_file_name in os.listdir(json_subdir_path):
            tender_bulletin_num = json_tender_file_name.replace(".json", "")

            # drop bulletin numbers that already have a downloaded JSON file
            for key in bulletin_nums_to_fetch.keys():
                if tender_bulletin_num in bulletin_nums_to_fetch[key]:
                    bulletin_nums_to_fetch[key].remove(tender_bulletin_num)
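A hedged usage sketch; the dict shape is inferred from the lookups above, and the key names and bulletin numbers are hypothetical:

to_fetch = {'observed': ['500123-N-2019'], 'reported': ['510456-N-2019']}
remove_existing_tenders_from_fetch_list(to_fetch)
# any number that already has a matching .json under bzp_data_jsons_dir is removed in place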
Example #4
def check_tender_lstm_network():
    w2v_model = Word2Vec.load(full_path('data/word2vecs/tenders_model'))
    model = keras.models.load_model(
        full_path(
            "lstm-net-backups/tenders_timestep150_drout0.4_rdrout0.4_batch64_short.h5"
        ))

    with open(full_path(DATA_SET_TENDERS['positive'])) as positive_corpus:
        positives_count = 0
        overall_count = 0
        for line in positive_corpus:
            tokens_pos = list(utils.tokenize(line, deacc=True, lower=True))
            document_pos = list(
                filter(lambda x: x not in STOP_LIST, tokens_pos))
            x_pos = np.array([document_to_batch(document_pos, w2v_model, 150)])
            overall_count += 1
            # run the network once per document and reuse the prediction
            prediction = evaluate(model, x_pos)
            if prediction == 'positive':
                positives_count += 1
            print(prediction)
        print('positives detected: ' + str(positives_count) + '/' +
              str(overall_count))
Example #5
def show_document_length_histogram(bin_count=30):
    if DATA_SET == DATA_SET_TREC:
        corpus = _load_corpus(full_path("data/trec/trec.corp"))
    else:
        corpus, labels = create_corpus_and_labels()
    sentence_lengths = []
    for document in corpus:
        sentence_lengths.append(len(document))

    plt.title(DATA_SET['label'])
    plt.xlabel("word count")
    plt.ylabel("document count")

    plt.hist(sentence_lengths, bins=bin_count)
    plt.show()
Example #6
def review_your_review():
    print("Wait for the google w2v model to load...")
    w2v_model = load_google_w2v_model()
    net_model = keras.models.load_model(
        full_path(
            "lstm-net-backups/imdb_timestep150_drout0.4_rdrout0.4_batch64.h5"))
    print("done")

    while True:
        line = input(
            "Type in your review or \"quit\" to finish then press ENTER: ")

        if line == 'quit':
            break

        tokens_line = list(utils.tokenize(line, deacc=True, lower=True))
        document_review = list(
            filter(lambda x: x not in STOP_LIST, tokens_line))
        line_numeric = np.array(
            [document_to_batch(document_review, w2v_model, 150)])

        print('I think this review is: ' + evaluate(net_model, line_numeric))
    print('Goodbye')
Example #7
def get_summaries_file_name(corpus_label: str):
    return full_path("log/{}_summary.log".format(corpus_label))
Example #8
def get_csv_log_file_name(corpus_label: str):
    return full_path("log/{}_timestep{}_drout{}_rdrout{}_batch{}.csv".format(
        corpus_label, str(DATA_SET['time_steps']), str(DATA_SET['dropout']),
        str(DATA_SET["recurrent_dropout"]), str(DATA_SET["batch_size"])))
Example #9
def get_vector_labels_file_name(corpus_label: str):
    return full_path("data/vector_words/" + corpus_label + "_labels.npy")
Example #10
def get_vector_words_directory_for_dataset(corpus_label: str, dataset):
    return full_path("data/vector_words/" + corpus_label +
                     "_words_max_timestep" + str(dataset['max_time_steps']))
Example #11
def get_w2v_file_name(corpus_label: str):
    return full_path("data/word2vecs/" + corpus_label + "_model")
Example #12
def get_dictionary_file_name(corpus_label: str):
    return full_path("data/dicts/" + corpus_label + "_dict")
Example #13
def get_tfidf_file_name(corpus_label: str):
    return full_path("data/tfidfs/" + corpus_label + "_tfidf_model")
Example #14
def get_network_model_snapshot(corpus_label: str):
    return full_path(
        "lstm-net/{}_timestep{}_drout{}_rdrout{}_batch{}.h5".format(
            corpus_label, str(DATA_SET['time_steps']),
            str(DATA_SET['dropout']), str(DATA_SET["recurrent_dropout"]),
            str(DATA_SET['batch_size'])))
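All of these path helpers follow the same naming scheme; a hypothetical call with the imdb configuration seen elsewhere in this listing (time_steps=150, dropout=0.4, recurrent_dropout=0.4, batch_size=64) would resolve to something like:

get_network_model_snapshot('imdb')
# -> <project root>/lstm-net/imdb_timestep150_drout0.4_rdrout0.4_batch64.h5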
Example #15
import numpy as np

from src.configuration import get_vector_labels_file_name, DATA_SET_TREC, \
    get_batch_file_name_for_dataset, get_vector_words_directory_for_dataset
from src.preprocessing.document_as_w2v_groups import document_to_batch
from src.preprocessing.w2v_loader import load_google_w2v_model
from src.utils.get_file import full_path, create_file_and_folders_if_not_exist

LABELS = {'DESC': [], 'HUM': [], 'ENTY': [], 'NUM': [], 'LOC': []}
CORPUS_FILE_NAME = full_path("data/trec/trec.corp")

# see http://cogcomp.org/Data/QA/QC/
TREC_TEXT_FILE_PATH = full_path("data/trec/trec.txt")
WORD_VECTORS_DIRECTORY = get_vector_words_directory_for_dataset(
    'trec', DATA_SET_TREC)
LABELS_FILE_PATH = get_vector_labels_file_name('trec')


def trec_preprocess():
    create_corpus_file()


def create_corpus_file():
    model = load_google_w2v_model()

    with open(TREC_TEXT_FILE_PATH, 'r', encoding='utf-8',
              errors='ignore') as file:
        iter_file = iter(file)
        for line in iter_file:
            label = line[:line.find(":")]
            if label in LABELS:
Example #16
def get_vector_words_directory(corpus_label: str):
    return full_path("data/vector_words/" + corpus_label +
                     "_words_max_timestep" + str(DATA_SET['max_time_steps']))
Example #17
def add_line_to_corpus(json_file_path, corpus_file_name):
    with open(json_file_path, encoding='utf-8') as json_file:
        json_object = json.load(json_file)
        data = re.sub(r'\s+', ' ', json_object['okreslenie_przedmiotu'].strip())
        data = re.sub(r'[^\w\s]|_', ' ', data)
        data = unidecode(data)

        words = data.split()
        words = [word for word in words if len(word) > 2]

        with open(corpus_file_name, mode='a+', errors='ignore') as corpus_file:
            corpus_file.write(' '.join(words) + '\n')
        # print(words)
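# Hypothetical sanity check of the cleanup above (the input string is invented for illustration,
# not part of the original module):
sample_text = 'Dostawa  sprzętu, IT!'
cleaned = re.sub(r'[^\w\s]|_', ' ', re.sub(r'\s+', ' ', sample_text.strip()))
print(' '.join(w for w in unidecode(cleaned).split() if len(w) > 2))
# prints: Dostawa sprzetu  (punctuation stripped, diacritics transliterated, tokens shorter than 3 chars dropped)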


json_dir_observed = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                              '/observed_json/')
json_dir_viewed = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                            '/viewed_json/')
json_dir_reported = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                              '/reported_json/')

positive_corpus_path = full_path(DATA_SET_TENDERS['positive'])
negative_corpus_path = full_path(DATA_SET_TENDERS['negative'])

if DATA_SET_TENDERS == DATA_SET_TENDERS_SHORT:
    add_lines_to_corpus_short(json_dir_observed, positive_corpus_path)
    add_lines_to_corpus_short(json_dir_viewed, positive_corpus_path)
    add_lines_to_corpus_short(json_dir_reported, negative_corpus_path)
elif DATA_SET_TENDERS == DATA_SET_TENDERS_LONG:
    add_lines_to_corpus(json_dir_observed, positive_corpus_path)
    add_lines_to_corpus(json_dir_viewed, positive_corpus_path)
Example #18
import json
import os

from src.configuration import DATA_SET_TENDERS
from src.preprocessing.iwium.iwium_bzb_api_client import fetch_data_daily
from src.utils.get_file import full_path

TRACKER_REPORTED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] + "/reported-offers.json")
TRACKER_OBSERVED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] + "/observed-offers.json")
TRACKER_VIEWED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] + "/viewed-offers.json")

OBSERVED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] + "/ids/observed_ids.txt")
REPORTED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] + "/ids/reported_ids.txt")
VIEWED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] + "/ids/viewed_ids.txt")

OBSERVED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] + "/bulletin_nums/observed_nums.txt")
REPORTED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] + "/bulletin_nums/reported_nums.txt")
VIEWED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] + "/bulletin_nums/viewed_nums.txt")


def parse_tracker_ids(tracker_data_file_path, out_valid_bulletin_numbers_file, out_ids_file):
    tender_ids = []
    with open(tracker_data_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        tenders_stats = json.load(file)
        for stat in tenders_stats:
            tender_ids.append(stat['what'])

        tender_ids = set(tender_ids)
        tender_ids = sorted(tender_ids)

        with open(out_valid_bulletin_numbers_file, 'w') as bulletin_nums_file: