示例#1
0
def classify(docs, sample_size=500):
    """Tag every document in ``docs`` using a random-forest classifier.

    The training collection is read from ``../data/phase2_train.csv``,
    preprocessed and indexed; a random subset of it is then used to fit an
    ``RFClassifier`` whose predictions are written to ``doc.tag`` in place.

    Args:
        docs: Mapping of document id to document object. Each document must
            expose an ``id`` attribute that is also its key in ``docs``.
        sample_size: Upper bound on the number of training documents drawn
            at random for fitting. Clamped to the size of the training set.

    Returns:
        None. ``docs`` is mutated: each document receives a ``tag``.
    """
    print("Classifying...")
    print("Preprocessing Train Data...")
    train_docs = read_docs('../data/phase2_train.csv')
    preprocessor = EnglishPreprocessor(train_docs)
    for doc in train_docs.values():
        doc.words = preprocessor.preprocess(doc.text)

    print("Indexing Train Data...")
    index = PositionalIndexer(train_docs, 1).index
    sliced_index = slice_index(index=index, n=200)

    # random.sample() requires a sequence: passing a dict view raises
    # TypeError on Python 3.11+, so materialise the keys first.  Clamping
    # the size avoids ValueError when the training set is small.
    train_ids = list(train_docs)
    chosen_ids = random.sample(train_ids, min(sample_size, len(train_ids)))
    sampled = {doc_id: train_docs[doc_id] for doc_id in chosen_ids}

    # The classifier still receives the size of the full training set.
    classifier = RFClassifier(sampled, sliced_index, len(train_docs))
    classifier.train()
    y_pred = classifier.classify(docs)

    # Predictions are matched to documents by position in docs.values().
    doc_ids = [doc.id for doc in docs.values()]
    for position, doc_id in enumerate(doc_ids):
        docs[doc_id].tag = y_pred[position]
示例#2
0
            print(str(current_param) + ":\t" + str(accuracy))
            if maximum_accuracy < accuracy:
                arg_max_param = current_param
                maximum_accuracy = accuracy

        return arg_max_param


class RFClassifier(SKLearnClassifier):
    """``SKLearnClassifier`` whose ``clf`` is a ``RandomForestClassifier``."""

    def __init__(self, train_docs, train_index, index_doc_count):
        """Run the parent initializer, then install the estimator.

        All three arguments are forwarded unchanged to
        ``SKLearnClassifier.__init__``; afterwards ``self.clf`` is set to a
        ``RandomForestClassifier`` created with ``n_estimators=100``.
        """
        super().__init__(train_docs, train_index, index_doc_count)
        forest = RandomForestClassifier(n_estimators=100)
        self.clf = forest


if __name__ == "__main__":
    # Load both collections up front.
    train_docs = read_docs('../data/phase2_train.csv')
    test_docs = read_docs('../data/phase2_test.csv')

    # The preprocessor is built from the training documents only and then
    # applied to both collections.
    preprocessor = EnglishPreprocessor(train_docs)

    # Training documents are processed first, then the test documents;
    # each document stores the result on its ``words`` attribute.
    for collection in (train_docs, test_docs):
        for doc in collection.values():
            doc.words = preprocessor.preprocess(doc.text)

    print("Preprocess is done!")

    # Only the training collection is indexed.
    index = PositionalIndexer(train_docs, 1).index
    print("Index Created Successfully!")

    index_doc_count = len(train_docs)
示例#3
0
        tokens = [t for t in tokens if re.search('[a-zA-Z-]', t) is None]
        return tokens

    def __init__(self, docs):
        """Initialise through the parent class with ``docs`` and a new ``Stemmer``."""
        stemmer = Stemmer()
        super().__init__(docs, stemmer)


if __name__ == "__main__":
    # Earlier ad-hoc experiments, kept for reference:
    # I'm reading this loud to this kids. These self-identifying kids nowadays read more than I ever did.
    # print("Stop Words:", find_stop_words("../data/English.csv"))
    # s = input()
    # ts = preprocess(s)
    # print(ts)
    # print(frequency_table(ts, 3))

    # Ask for the task first, then the language (order of prompts matters
    # to the person at the terminal).
    task = input("Select task: 1. Preprocess a text 2. Show frequent words")
    language = input("Select language: 1. English 2. Persian")

    # Anything other than "1" selects the Persian collection.
    if language != "1":
        docs = read_docs('../data/Persian.xml')
        preprocessor = PersianPreprocessor(docs)
    else:
        docs = read_docs('../data/English.csv')
        preprocessor = EnglishPreprocessor(docs)

    # Unrecognised task values fall through and do nothing.
    if task == "2":
        preprocessor.print_high_freq_tokens()
    elif task == "1":
        preprocessor.preprocess(input("Enter text:"), True)
#  print(preprocessor.preprocess("""
# Parmalat to sue auditors PARMALAT, the bankrupt Italian food company, is suing outside auditors Grant Thornton and Deloitte  amp; Touche, seeking 5.5 billion in damages.
#  """))