示例#1
0
def create_sentence_files():
    """Write the stop-word-filtered questions into per-topic sentence files.

    Reads the stop-word list, the ``;``-separated questions file and the
    topic matrix from ``./Primary_data/``.  For every topic column two files
    are created under ``./Primary_data/sent_topic/``: ``<topic>.p`` receives
    the sentences whose value in that topic column is ``'1'`` and
    ``<topic>.n`` those whose value is ``'0'``.  Any other value is reported
    with ``print("wattt")`` and the sentence is skipped for that topic.

    Row ``i`` of the questions is matched with row ``i`` of the topic matrix.
    """
    stop_words = set(
        pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = QuickDataFrame.read_csv('./Primary_data/result_filtered.csv',
                                        sep=';')
    topics = QuickDataFrame.read_csv('./Primary_data/topic_vector_Q.csv')

    files = dict()
    # Everything that touches the open handles lives inside the try block so
    # that they are closed even when opening a later file or processing a
    # row raises.
    try:
        for tpc in topics.cols:
            files[tpc + '-p'] = open('./Primary_data/sent_topic/' + tpc + '.p',
                                     'w',
                                     encoding='utf-8')
            files[tpc + '-n'] = open('./Primary_data/sent_topic/' + tpc + '.n',
                                     'w',
                                     encoding='utf-8')

        prog = Progresser(len(questions['sentence']))
        # build the train data
        for i, qrow in enumerate(questions['sentence']):
            prog.count()
            snt = ' '.join(
                word for word in tokenise(qrow) if word not in stop_words)
            for tpc in topics.cols:
                label = topics[tpc][i]
                if label == '0':
                    files[tpc + '-n'].write(snt + '\n')
                elif label == '1':
                    files[tpc + '-p'].write(snt + '\n')
                else:
                    # topic values are expected to be the strings '0' or '1'
                    print("wattt")
    finally:
        for fl in files.values():
            fl.close()
示例#2
0
def find_frequent_words():
    """Count lemmatised, non-stop-word tokens of the StackExchange questions.

    Every ``body`` in ``./StackExchange_data/all_data.csv`` is tokenised and
    POS-tagged.  Tokens shorter than two characters are dropped; the rest are
    lower-cased, stripped of leading whitespace/dashes and of one trailing
    dash, filtered against the English stop-word list and (when longer than
    two characters) lemmatised.

    Outputs, all under ``./StackExchange_data/``:

    * ``words<i>.csv`` - snapshot of the counts whenever the row index ``i``
      is a multiple of 5000,
    * ``words_frequency.csv`` - ``word,count`` sorted by descending count,
    * ``1000words.csv`` - the 1000 most frequent words, one per line.

    Files are written as UTF-8 so that non-ASCII words are not lost to the
    platform default encoding.
    """
    data = pd.read_csv('./StackExchange_data/all_data.csv')
    words = dict()
    lemmatiser = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # To resume an interrupted run, preload `words` from a snapshot file
    # (lines of "word,count") and skip the rows up to the snapshot's index.

    p = Progresser(data.shape[0])
    # raw string: '\s' in a plain literal is an invalid escape sequence
    cleaner = re.compile(r'^\s*-*|-\s*$')

    for i, row in data.iterrows():
        p.show_progress(i)

        for token, tag in pos_tag(word_tokenize(row['body'])):
            if len(token) < 2:
                continue

            word = cleaner.sub('', token.lower())

            # tokens made only of dashes (e.g. '--') clean down to '' and
            # must not be counted as a word
            if not word or word in stop_words:
                continue

            if len(word) > 2:
                word = lemmatiser.lemmatize(word=word,
                                            pos=get_wordnet_pos(tag))
                # the lemma itself may be a stop word
                if word in stop_words:
                    continue

            words[word] = words.get(word, 0) + 1

        if i % 5000 == 0:
            with open('./StackExchange_data/words' + str(i) + '.csv',
                      'w',
                      encoding='utf-8') as outfile:
                for word, freq in words.items():
                    outfile.write(word + ',' + str(freq) + '\n')

    sorted_words = sorted(words, key=words.get, reverse=True)

    with open('./StackExchange_data/words_frequency.csv',
              'w',
              encoding='utf-8') as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except UnicodeEncodeError as e:
                # best effort: report the unwritable word instead of
                # silently dropping it
                print(repr(word), e)

    with open('./StackExchange_data/1000words.csv', 'w',
              encoding='utf-8') as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')
示例#3
0
def create_w2v_vectors():
    """Build one averaged word2vec vector per question and save them as CSV.

    For every ``sentence`` in ``./Primary_data/result_filtered.csv`` the
    vectors of all tokens that are not stop words and are present in the
    pickled word2vec mapping are averaged.  A sentence without any such
    token yields an all-zero vector (instead of dividing by zero and
    producing NaNs).  The result, with columns ``w0`` .. ``w99``, is written
    to ``./Primary_data/w2v-100_vector_Q.csv``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files from a trusted source.
    # Alternative model: './word2vec/IRBlog/w2v_per_300.pkl' (use
    # w2v_length = 300 with it).
    with open('./word2vec/Mixed/w2v_per.pkl', 'rb') as infile:
        w2v = pickle.load(infile)
    w2v_length = 100  # must match the dimensionality of the loaded vectors
    stop_words = set(
        pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = pd.read_csv('./Primary_data/result_filtered.csv',
                            delimiter=';')

    train = QuickDataFrame(['w' + str(i) for i in range(0, w2v_length)])

    prog = Progresser(questions.shape[0])
    # build the train data
    for i, qrow in questions.iterrows():
        prog.count()
        sum_array = np.zeros(w2v_length)
        number_of_words = 0

        for word in tokenise(qrow['sentence']):
            if word not in stop_words and word in w2v:
                number_of_words += 1
                sum_array += w2v[word]
        # sanity check: the row index should equal the number of rows
        # appended so far
        if i != len(train):
            print('wat?!!')
        # avoid 0/0 for sentences with no in-vocabulary word
        if number_of_words > 0:
            sum_array = sum_array / number_of_words
        train.append(list(sum_array))

    train.to_csv('./Primary_data/w2v-100_vector_Q.csv')
def lemmatise_all():
    """Lemmatise every EurLex text file into ``./EurLex_data/lem_txt/``.

    For each ``DocID`` in ``eurlex_ID_mappings.csv`` the file
    ``eurlex_txt/<DocID>.txt`` is read, split into sentences, tokenised and
    POS-tagged.  Tokens shorter than two characters and English stop words
    are dropped; the rest are lower-cased, stripped of leading
    whitespace/dashes and of one trailing dash, and lemmatised when longer
    than two characters.  The result is written to ``lem_txt/<DocID>-lem.txt``
    with one sentence per line and a space after every word.

    Documents whose output file already exists, or whose input file cannot
    be read, are skipped.  Any other error is printed and processing
    continues with the next document.
    """
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    lemmatiser = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # raw string: '\s' in a plain literal is an invalid escape sequence
    cleaner = re.compile(r'^\s*-*|-\s*$')

    prog = Progresser(id_mappings.shape[0])

    for i, row in id_mappings.iterrows():
        prog.count()
        try:
            doc_id = str(row['DocID'])
            out_path = './EurLex_data/lem_txt/' + doc_id + '-lem.txt'

            # if file already processed then continue
            if os.path.isfile(out_path):
                continue

            try:
                with open('./EurLex_data/eurlex_txt/' + doc_id + '.txt',
                          'r',
                          encoding="utf8") as infile:
                    raw_text = infile.read()
            except (OSError, UnicodeDecodeError):
                # missing or unreadable input: skip silently, but let
                # KeyboardInterrupt / SystemExit propagate
                continue

            lemmatised_sents = []

            # lemmatise each sentence
            for sent in sent_tokenize(raw_text):
                kept = []

                # lemmatise each word in sentence
                for token, tag in pos_tag(word_tokenize(sent)):
                    if len(token) < 2:
                        continue

                    word = cleaner.sub('', token.lower())
                    if word in stop_words:
                        continue

                    if len(word) > 2:
                        word = lemmatiser.lemmatize(word=word,
                                                    pos=get_wordnet_pos(tag))
                        # the lemma itself may be a stop word
                        if word in stop_words:
                            continue

                    kept.append(word + ' ')
                lemmatised_sents.append(''.join(kept) + '\n')

            # write doc to file
            with open(out_path, 'w', encoding="utf8") as outfile:
                outfile.write(''.join(lemmatised_sents))
        except Exception as e:
            print(e)
def inter_cross_validation(x, y, algs, k=10):
    """Pick the algorithm with the best mean macro-F1 over a k-fold split.

    Args:
        x: feature matrix, indexable by an array of row indices.
        y: 2-D label matrix (rows = samples, columns = labels).
        algs: mapping of algorithm name -> base classifier; each one is
            wrapped in ``BinaryRelevance`` and fitted on every fold.
        k: number of folds.

    Returns:
        The name of the algorithm with the highest mean macro-averaged F1
        score, or ``''`` when no algorithm reached a mean score above 0
        (for example because every fit failed).

    Raises:
        ValueError: if a label column holds fewer than two distinct values
            in the whole of ``y``; no split could ever satisfy the fold
            requirement and the search below would never terminate.
    """
    print('ytrain.shape:', y.shape)

    n_labels = len(y[0])
    for i in range(n_labels):
        if len(np.unique(y[:, i])) < 2:
            raise ValueError(
                'label column %d has a single class in the whole data set; '
                'cannot build folds with both classes in training' % i)

    # Re-draw random splits until every training part contains at least two
    # distinct values for every label.
    while True:
        kf = list(
            KFold(n_splits=k, shuffle=True,
                  random_state=randint(0, 100000)).split(x))
        good_folds = True
        for train_index, test_index in kf:
            for i in range(n_labels):
                if len(np.unique(y[train_index, i])) < 2:
                    print(i)
                    good_folds = False
                    break
            if not good_folds:
                break
        if good_folds:
            break

    f1scr = {alg: [] for alg in algs.keys()}

    prog = Progresser(k)
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for alg_name, alg_cls in algs.items():
            topic_classifier = BinaryRelevance(classifier=alg_cls,
                                               require_dense=[True, True])
            try:
                topic_classifier.fit(x_train, y_train)
            except Exception as e:
                print('\nfit error!:', e, alg_name)
                continue
            try:
                predictions = topic_classifier.predict(x_test)
                f1scr[alg_name].append(
                    f1_score(y_test, predictions, average='macro'))
                print('--', alg_name, f1scr[alg_name])
            except Exception as e:
                print('Eval error!:', e)
        prog.count()

    best_alg = ''
    best_score = 0
    for alg_name, score in f1scr.items():
        if not score:
            # no fold produced a score for this algorithm; np.mean([])
            # would only yield NaN plus a warning
            continue
        mean_score = np.mean(score)
        if mean_score > best_score:
            best_alg = alg_name
            best_score = mean_score

    if best_alg in f1scr:
        print(best_alg, best_score, '+-', np.std(f1scr[best_alg]))
    else:
        # previously this case crashed with KeyError on f1scr['']
        print('No algorithm reached a mean macro F1 score above 0')
    return best_alg
def find_frequent_words():
    """Count token frequencies over the lemmatised EurLex documents.

    Reads ``lem_txt/<DocID>-lem.txt`` for every ``DocID`` in
    ``eurlex_ID_mappings.csv`` (documents without such a file are skipped),
    counts every token and keeps only tokens that are either a four-digit
    number starting with ``19`` or ``2`` or that contain no digit at all.

    Outputs under ``./EurLex_data/``:

    * ``words_frequency.csv`` - ``word,count`` sorted by descending count,
    * ``1000words.csv`` - the 1000 most frequent kept words, one per line.
    """
    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    words = dict()

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the file
            try:
                with open('./EurLex_data/lem_txt/' +
                          str(id_mappings['DocID'][i]) + '-lem.txt',
                          'r',
                          encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError:
                # no lemmatised file for this document
                continue
            # count the words
            for word in word_tokenize(doc_text):
                words[word] = words.get(word, 0) + 1
        except Exception as e:
            print(e)

    # remove bad words
    # raw string: '\d' / '\w' in a plain literal are invalid escape sequences
    cleaner = re.compile(r'^(19\d\d)$|^(2\d\d\d)$|^((?!\d)\w)*$')
    filtered_words = {
        word: freq
        for word, freq in words.items() if cleaner.match(word)
    }

    sorted_words = sorted(filtered_words,
                          key=filtered_words.get,
                          reverse=True)

    with open('./EurLex_data/words_frequency.csv', 'w',
              encoding="utf8") as outfile:
        for word in sorted_words:
            try:
                outfile.write(
                    str(word) + ',' + str(filtered_words[word]) + '\n')
            except Exception as e:
                # best effort: report the word that could not be written
                print(word, e)

    with open('./EurLex_data/1000words.csv', 'w', encoding="utf8") as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')
def process_html_files():
    """Extract the raw text of every EurLex HTML document.

    For each row of ``eurlex_ID_mappings.csv`` the HTML file named by
    ``Filename`` (with ``:`` replaced by ``_``) is parsed and the stripped
    text of its first ``<div class="texte">`` is saved as
    ``eurlex_txt/<DocID>.txt``.  Documents that already have an output file
    are skipped.  Three counters are printed at the end: documents without a
    ``texte`` div, documents whose text is empty or starts with the
    "no English" marker, and documents for which an ``IOError`` occurred.
    """
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    missing_file = 0
    missing_english = 0
    missing_texte = 0
    prog = Progresser(id_mappings.shape[0])

    for _, row in id_mappings.iterrows():
        prog.count()
        try:
            txt_path = './EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt'

            # nothing to do when the text was extracted on an earlier run
            if os.path.isfile(txt_path):
                continue

            # load the source html
            html_path = ('./EurLex_data/eurlex_html_EN_NOT/' +
                         row['Filename'].replace(':', '_'))
            with open(html_path, 'r', encoding="utf8") as infile:
                html = infile.read()

            # pull the text out of the first "texte" div
            texte_divs = BeautifulSoup(html, 'html.parser').findAll(
                'div', {'class': 'texte'})
            if len(texte_divs) == 0:
                missing_texte += 1
                continue
            raw_text = texte_divs[0].text.strip()
            if raw_text == '' or raw_text.startswith('/* There is no English'):
                missing_english += 1
                continue

            # store the extracted text
            with open(txt_path, 'w', encoding="utf8") as outfile:
                outfile.write(raw_text)

        except IOError:
            missing_file += 1
        except Exception as e:
            print(e)

    print('NO texte:', missing_texte)
    print('NO EN:', missing_english)
    print('NO file:', missing_file)
def build_w2v_vectors():
    """Average the word2vec vectors of every lemmatised EurLex document.

    Each document listed in ``eurlex_ID_mappings.csv`` that has a
    ``lem_txt/<DocID>-lem.txt`` file becomes one row: the ``doc_id`` followed
    by the mean of the vectors of all tokens that are not English stop words
    and are present in the pickled word2vec mapping (all zeros when no token
    qualifies).  The table is saved to ``./EurLex_data/w2v_vector_Q.csv``.
    """
    with open('./word2vec/word2vec-En.pkl', 'rb') as infile:
        w2v = pickle.load(infile)

    w2v_length = 300
    stop_words = set(stopwords.words('english'))

    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    # one id column followed by one column per vector component
    cols_list = ['doc_id']
    cols_list.extend('w' + str(dim) for dim in range(0, w2v_length))
    train = QuickDataFrame(columns=cols_list)

    doc_count = len(id_mappings)
    prog = Progresser(doc_count)
    for i in range(doc_count):
        prog.count()
        doc_id = id_mappings['DocID'][i]

        # load the lemmatised text; documents without one are skipped
        try:
            with open('./EurLex_data/lem_txt/' + str(doc_id) + '-lem.txt',
                      'r',
                      encoding="utf8") as infile:
                doc_text = infile.read()
        except IOError:
            continue

        try:
            vector_sum = np.zeros(w2v_length)
            hits = 0

            for word in word_tokenize(doc_text):
                if word in stop_words or word not in w2v:
                    continue
                hits += 1
                vector_sum += w2v[word]

            # leave the zero vector untouched when nothing matched
            doc_vector = vector_sum / hits if hits > 0 else vector_sum

            train.append([doc_id] + list(doc_vector))

        except Exception as e:
            print(e)

    train.to_csv('./EurLex_data/w2v_vector_Q.csv')
示例#9
0
def build_tag_vectors():
    """Write a binary tag-occurrence matrix for the StackExchange questions.

    Reads ``all_data.csv`` and ``tags.csv`` from ``./StackExchange_data/`` and
    writes ``data_tags.csv``: one column per tag in ``tags.csv`` (1 when the
    question carries the tag, 0 otherwise) followed by a ``question_id``
    column holding the question's ``id``.

    Rows are addressed by their position in the file rather than by the
    ``id`` value, and the ids are kept outside the int16 matrix, so ids that
    are not ``0..n-1`` or exceed the int16 range are written correctly.
    """
    data = pd.read_csv('./StackExchange_data/all_data.csv')
    topics = pd.read_csv('./StackExchange_data/tags.csv')

    tag_cols = list(topics['term'])
    cols_list = tag_cols + ['question_id']
    data = data.set_index('id')

    n_rows = data.shape[0]
    # occurrence flags only; int16 is plenty for 0/1 values
    train_arr = np.zeros((n_rows, len(tag_cols)), dtype=np.int16)
    question_ids = []
    col_index = {col: ind for ind, col in enumerate(tag_cols)}

    # build the train data
    print('Building train data...')
    p = Progresser(n_rows)
    for row_num, (question_id, qrow) in enumerate(data.iterrows()):
        p.show_progress(row_num)

        question_ids.append(question_id)
        # set occurrence values
        # NOTE(review): eval() executes whatever expression is stored in the
        # 'tag' column; only run this on a trusted data file (consider
        # ast.literal_eval if the column holds plain list literals).
        row_tags = eval(qrow['tag'])
        for tp in row_tags:
            try:
                train_arr[row_num, col_index[tp]] = 1
            except (KeyError, TypeError):
                # tag is not one of the tracked tags (or is unhashable)
                continue

    # write to file
    print('\nWriting to file...')
    with open('./StackExchange_data/data_tags.csv', 'w') as outfile:
        outfile.write(','.join(cols_list) + '\n')
        p = Progresser(n_rows)
        for line_num, line in enumerate(train_arr):
            p.show_progress(line_num)
            fields = [str(value) for value in line.tolist()]
            fields.append(str(question_ids[line_num]))
            outfile.write(','.join(fields) + '\n')
def build_all_vectors():
    """Combine word-occurrence and subject columns for the EurLex documents.

    Builds a table with a ``doc_id`` column and one column per term of
    ``1000words.csv``; a cell is set to 1 when the term occurs in the
    document's lemmatised text.  Word columns are then renamed to
    ``wrd<N>``, one column per term of ``tags.csv`` is added and set to 1
    for every (subject, document) pair listed in the subject-matter qrels
    file, and those columns are renamed to ``tpc<N>``.  The table is saved to
    ``./EurLex_data/eurlex_combined_vectors-w2v.csv``.
    """
    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    subject_data = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
        header=False,
        columns=['sub', 'doc_id', 'col2'],
        sep=' ')
    words_vector = QuickDataFrame.read_csv('./EurLex_data/1000words.csv',
                                           header=False,
                                           columns=['term'])
    topics = QuickDataFrame.read_csv('./EurLex_data/tags.csv')

    # NOTE(review): an alternative start point that was tried is loading
    # './EurLex_data/w2v_vector_Q.csv' and indexing it by 'doc_id' instead of
    # building the word-occurrence table below.

    # empty table: doc id plus one column per frequent word
    train = QuickDataFrame(columns=['doc_id'] + list(words_vector['term']))

    # fill the word columns, one row per document with a lemmatised file
    doc_count = len(id_mappings)
    prog = Progresser(doc_count)
    for i in range(doc_count):
        prog.count()
        try:
            lem_path = ('./EurLex_data/lem_txt/' +
                        str(id_mappings['DocID'][i]) + '-lem.txt')
            try:
                with open(lem_path, 'r', encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # start a fresh all-zero row and remember its position
            train.append(value=0)
            last_row = len(train) - 1

            train['doc_id'][last_row] = id_mappings['DocID'][i]
            for word in word_tokenize(doc_text):
                if word in train.data:
                    train[word][last_row] = 1
        except Exception as e:
            print(e)

    # index by doc id
    train.set_index(train['doc_id'], unique=True)

    # word columns become wrd0, wrd1, ...
    word_names = {
        wrd: 'wrd' + str(pos)
        for pos, wrd in enumerate(list(words_vector['term']))
    }
    train.rename(columns=word_names)

    # one zero-filled column per topic
    for col in list(topics['term']):
        train.add_column(name=col, value=0)

    # mark the subjects assigned to each document
    for i in range(len(subject_data)):
        try:
            train[subject_data['sub'][i], subject_data['doc_id'][i]] = 1
        except Exception as e:
            print(e)

    # topic columns become tpc0, tpc1, ...
    topic_names = {
        tpc: 'tpc' + str(pos)
        for pos, tpc in enumerate(list(topics['term']))
    }
    train.rename(columns=topic_names)

    # write to file
    print('\nWriting to file...')
    # alternative output name: './EurLex_data/eurlex_combined_vectors.csv'
    train.to_csv('./EurLex_data/eurlex_combined_vectors-w2v.csv')
def evaluate_model_cnn(x, y, learn_path, k=10):
    """Evaluate one ``TextCNN`` per label with k-fold cross validation.

    Random k-fold splits are drawn until every training part holds at least
    two distinct values for every label; the chosen split is pickled to
    ``learn_path + 'topic_classifier-folds.pkl'``.  For each fold every
    label's CNN is trained via ``train_cnn(train_index, label)`` and asked to
    predict via ``predict_text(test_index)``; eight multi-label metrics are
    appended to a stats table and printed.  Finally the mean of every metric
    over the folds is printed.

    Args:
        x: sample matrix, indexable by an array of row indices.
        y: 2-D label matrix (rows = samples, columns = labels).
        learn_path: path prefix for the pickled fold indices.
        k: number of folds.
    """
    print(len(y), len(y[0]))

    # draw splits until no training part has a single-valued label column
    count = 0
    found = False
    while not found:
        count += 1
        kf = list(
            KFold(n_splits=k, shuffle=True,
                  random_state=randint(0, 100000)).split(x))
        found = True
        for train_index, test_index in kf:
            bad_label = next(
                (i for i in range(len(y[0]))
                 if len(np.unique(y[train_index, i])) < 2), None)
            if bad_label is not None:
                print(bad_label)
                found = False
                break
    print('Found a good KF in', count, 'try!')

    with open(learn_path + 'topic_classifier-folds.pkl', 'wb') as out_file:
        pickle.dump(kf, out_file)

    metric_names = [
        'Jaccard (normalised)', 'Accuracy (normalised)', 'Accuracy',
        'F1_score (micro averaged)', 'F1_score (macro averaged by labels)',
        'F1_score (averaged by samples)', 'Hamming loss', 'Label Ranking loss:'
    ]
    stats = QuickDataFrame(metric_names)

    label_count = y.shape[1]
    txt_cnns = [TextCNN() for _ in range(label_count)]
    prog = Progresser(k)
    for train_index, test_index in kf:
        print('___________________________________________________')
        # the training slices are not used below: the CNNs receive the
        # indices themselves
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train one CNN per label on this fold's training indices
        for label in range(label_count):
            txt_cnns[label].train_cnn(train_index, label)

        # collect each label's predictions as one column
        predictions = np.zeros((len(x_test), label_count))
        for label in range(label_count):
            predictions[:, label] = np.array(
                txt_cnns[label].predict_text(test_index))

        fold_scores = [
            jaccard_score(y_test, predictions, average='micro'),
            accuracy_score(y_test, predictions, normalize=True),
            accuracy_score(y_test, predictions, normalize=False),
        ]
        fold_scores.extend(
            f1_score(y_test, predictions, average=avg)
            for avg in ('micro', 'macro', 'samples'))
        fold_scores.append(hamming_loss(y_test, predictions))
        fold_scores.append(label_ranking_loss(y_test, predictions))

        stats.append(fold_scores)
        print(stats[stats.length - 1])

        prog.count()

    for col in stats.cols:
        print(col, np.mean(stats[col]))
示例#12
0
def build_word_vectors():
    """Write a binary word-occurrence matrix for the StackExchange questions.

    Reads ``all_data-lemmatised.csv`` and ``1000words.csv`` from
    ``./StackExchange_data/`` and writes ``data_1000word.csv``: one column
    per listed word (1 when the word occurs in the question's ``lem_body``,
    0 otherwise) followed by a ``question_id`` column holding the
    question's ``id``.

    The input is expected to carry a ``lem_body`` column with the
    lemmatised question text (space-separated tokens).  It was originally
    derived from ``body`` by lower-casing every token, stripping leading
    whitespace/dashes and one trailing dash, and lemmatising tokens longer
    than two characters.

    Rows are addressed by their position in the file rather than by the
    ``id`` value, and the ids are kept outside the int16 matrix, so ids that
    are not ``0..n-1`` or exceed the int16 range are written correctly.
    """
    data = pd.read_csv('./StackExchange_data/all_data-lemmatised.csv')
    # names must be an ordered collection; pandas rejects a set here
    words_vector = pd.read_csv('./StackExchange_data/1000words.csv',
                               header=None,
                               names=['term'])

    data = data.set_index(data['id'])
    word_cols = list(words_vector['term'])
    cols_list = word_cols + ['question_id']

    n_rows = data.shape[0]
    # occurrence flags only; int16 is plenty for 0/1 values
    train_arr = np.zeros((n_rows, len(word_cols)), dtype=np.int16)
    question_ids = []
    col_index = {col: ind for ind, col in enumerate(word_cols)}

    # build the train data
    print('Building train data...')
    p = Progresser(n_rows)
    for row_num, (question_id, qrow) in enumerate(data.iterrows()):
        p.show_progress(row_num)

        question_ids.append(question_id)
        body = qrow['lem_body']
        if not isinstance(body, str):
            # an empty body is read back from CSV as NaN: no words to flag
            continue
        # set occurrence values
        for word in body.split():
            ind = col_index.get(word)
            if ind is not None:
                train_arr[row_num, ind] = 1

    # write to file
    print('\nWriting to file...')
    with open('./StackExchange_data/data_1000word.csv', 'w') as outfile:
        outfile.write(','.join(cols_list) + '\n')
        p = Progresser(n_rows)
        for line_num, line in enumerate(train_arr):
            p.show_progress(line_num)
            fields = [str(value) for value in line.tolist()]
            fields.append(str(question_ids[line_num]))
            outfile.write(','.join(fields) + '\n')