def score(self, text):
    if not isinstance(text, str):  # basestring is Python 2 only; use str
        raise ValueError("score: text must be a string.")
    processed = TextPreprocessor(text)
    words = processed.get_words()
    polarity = TextBlob(processed.get_raw()).sentiment.polarity
    scores = []
    for attr in self.packet.getAttributes():
        attrScore = 0
        for i in range(attr.get_size()):
            word = attr.get_word(i)
            multiword = len(word.split()) > 1
            expectedSent = attr.get_sentiment_num(i)
            # Only count the word when the text's overall polarity agrees
            # with the sentiment expected for this attribute word.
            if polarity * expectedSent >= 0:
                significance = attr.get_weight_num(i)
                if multiword:
                    # Multi-word phrases are matched against the token
                    # stream joined with single spaces.
                    spacedText = " ".join(processed.get_tokens())
                    attrScore += spacedText.count(word) * significance
                else:
                    attrScore += words.count(word) * significance
        scores.append(attrScore)
    # Pad the score list out to five attributes.
    for i in range(len(scores), 5):
        scores.append(0)
    return scores
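# A minimal sketch of how score() might be driven. The packet/attribute
# classes below are hypothetical stand-ins for illustration only; the real
# ones are defined elsewhere in this codebase.
class FakeAttribute:
    def __init__(self, entries):
        # entries: list of (word, expected_sentiment, weight) triples
        self.entries = entries

    def get_size(self):
        return len(self.entries)

    def get_word(self, i):
        return self.entries[i][0]

    def get_sentiment_num(self, i):
        return self.entries[i][1]

    def get_weight_num(self, i):
        return self.entries[i][2]


class FakePacket:
    def __init__(self, attributes):
        self.attributes = attributes

    def getAttributes(self):
        return self.attributes


# Assumed usage, where Scorer is whatever class defines score():
# scorer = Scorer(FakePacket([FakeAttribute([("battery life", 1, 2.0)])]))
# print(scorer.score("The battery life is great."))  # e.g. [2.0, 0, 0, 0, 0]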
def _preprocess(self, data):
    preprocessor = TextPreprocessor('tt')
    x = []
    y = []
    for d in data:
        t, l, _ = preprocessor.pp(d)
        if t:
            x.append(t)
            y.append(l)
    return x, y, list(preprocessor.wordSet)
def _preprocess(self, project, dataType):
    preprocessor = TextPreprocessor(dataType)
    if dataType == 'UserManual':
        data = self.manuals
    else:
        data = self.issues
    for d in data:
        if dataType == 'UserManual':
            d.sentences = preprocessor.pp(d.sentences)
        else:
            # 'lables' [sic] kept as-is to match the attribute name on the
            # issue objects.
            d.title, _, d.body = preprocessor.pp(
                (d.title, d.lables, d.body))
    return sorted(preprocessor.wordSet), preprocessor.maxLen
def readfile(self):
    t = TextPreprocessor()
    # Gets the lists of model answers and student answers.
    # Format of model answer: {'1.1': ['word1', 'word2'], '1.2': ['word1', 'word2']}
    # Format of student answer: [['1.1', 'word1', 'word2'], ['1.1', 'word1', 'word2'],
    #                            ['1.2', 'word1', 'word2'], ['1.2', 'word1', 'word2']]
    model_answer = t.parse_file(
        "D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/ShortAnswerGrading_v2.0/data/sent/answers",
        False)
    student_answers = t.parse_file(
        "D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/ShortAnswerGrading_v2.0/data/sent/all",
        True)
    model_answer_list = [model_answer[key] for key in model_answer]
    self.load_model(model_answer_list)
    dimensions = len(self.model[next(iter(self.model.vocab.keys()))])
    self.convert_to_vectors(student_answers, model_answer, dimensions)
    for key in self.avg_model_answer_vector:
        print(key)
        model_vector = self.avg_model_answer_vector[key]
        with open(
                'D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/OutputFolderBrownSkipGram/'
                + key + '.txt', 'w') as writer:
            for answer in self.avg_student_answers_vector[key]:
                similarity_score = np.float64(
                    self.calculate_similarity(model_vector, answer)).item()
                # Map cosine similarity onto the 0-5 grading scale.
                if similarity_score >= 0.97:
                    score = 5
                elif similarity_score >= 0.89:
                    score = 4.5
                elif similarity_score >= 0.86:
                    score = 4
                elif similarity_score >= 0.85:
                    score = 3
                elif similarity_score >= 0.80:
                    score = 2
                elif similarity_score >= 0.75:
                    score = 1
                elif similarity_score > 0:
                    score = 0.5
                else:
                    score = 0
                writer.write(str(score) + '\n')
def categorize_texts(texts):
    data_frame = pd.DataFrame(texts, columns=[Categorizer.text_col_name])
    data_frame = TextPreprocessor.preprocess_texts(
        data_frame,
        col_name=Categorizer.text_col_name,
        new_col_name=Categorizer.prep_col_name,
        storage_level=0, storage_name='', log=1)
    data_frame = Vectorizer.vectorize(
        data_frame,
        col_name=Categorizer.prep_col_name,
        new_col_name=Categorizer.vec_col_name,
        storage_level=0, storage_name='', log=1)
    data_frame = Classifier.classify(
        data_frame,
        col_name=Categorizer.vec_col_name,
        new_col_name=Categorizer.classified_col_name,
        storage_level=0, storage_name='', log=1)
    data_frame = ClassificationInterpreter.interpret_output(
        data_frame,
        col_name=Categorizer.classified_col_name,
        new_col_name=Categorizer.result_col_name,
        storage_level=0, storage_name='', log=1)
    return data_frame[Categorizer.result_col_name].tolist()
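# Illustrative call, assuming the session is configured and the models are
# already trained (see run_setup below). The inputs and outputs here are
# made up; the labels depend entirely on the trained classifier.
# results = categorize_texts([
#     "The app crashes when I rotate the screen.",
#     "Great battery life and a sharp display.",
# ])
# print(results)  # e.g. ['bug_report', 'praise']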
def main(node_path, arguments_path, preprocessed_node_path):
    """
    Load node and edge lists, select the nodes appearing in the edge list,
    and apply NLP transformations (lemmatization, POS tagging).
    """
    node_path = Path(node_path)
    preprocessed_node_path = Path(preprocessed_node_path)
    arguments_path = Path(arguments_path)

    arguments_getter = lambda: utils.load(arguments_path)
    node_ids = set(
        node_id
        for argument in arguments_getter()
        for node_id in argument[0].values())

    node_generator_getter = lambda: utils.load(node_path)
    filtered_nodes_getter = lambda: filter(
        lambda node: node['n'] in node_ids, node_generator_getter())

    text_preprocessor = TextPreprocessor()
    preprocessed_text_generator = text_preprocessor.transform(
        map(lambda node: node['s'], filtered_nodes_getter()))

    renamed_node_generator = map(
        lambda node: {
            'id': node['n'],
            'document': node['s'],
            'debate_name': node['debate_name']
        }, filtered_nodes_getter())

    preprocessed_node_generator = utils.merge_dicts(zip(
        renamed_node_generator, preprocessed_text_generator))
    utils.dump(tqdm(preprocessed_node_generator), preprocessed_node_path)
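# utils.merge_dicts is assumed to fuse each (renamed_node, preprocessed_text)
# pair from the zipped stream into a single dict. A minimal stand-in under
# that assumption, for illustration only:
def _merge_dicts_sketch(pairs):
    for left, right in pairs:
        merged = dict(left)
        merged.update(right)
        yield merged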
class TextPreprocessorTests(unittest.TestCase):
    def setUp(self):
        self.tpp = TextPreprocessor()

    # preprocess
    # pass
    def test000_000_preprocess_caps(self):
        test = "iNcoNSIstent CaPITAlizaTION"
        self.tpp.preprocess(test)  # smoke test: should not raise

    # spellcheck
    # pass
    def test100_000_spellcheck1(self):
        testSentence = "I havv goood speeling"
        correctSentence = "I have good spelling"
        self.assertEqual(self.tpp.spellcheck(testSentence), correctSentence)

    def test100_001_spellcheck2(self):
        testSentence = "omg i dont kno wuts hapening"
        correctSentence = "omg i dont know whats happening"
        self.assertEqual(self.tpp.spellcheck(testSentence), correctSentence)

    def test100_002_spellcheck3(self):
        testSentence = "test tets tast yest"
        correctSentence = "test test test test"
        self.assertEqual(self.tpp.spellcheck(testSentence), correctSentence)

    def test100_003_spellcheck4(self):
        testSentence = "dos anyon hav a dolar"
        correctSentence = "does anyone have a dollar"
        self.assertEqual(self.tpp.spellcheck(testSentence), correctSentence)
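# To run this test class standalone, the usual unittest entry point
# suffices. The import path for TextPreprocessor is an assumption based on
# the other modules in this collection.
# import unittest
# from TextPreprocessor import TextPreprocessor
#
# if __name__ == '__main__':
#     unittest.main()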
def score(self, text):
    processed = TextPreprocessor(text)
    bagOfWords = processed.get_words()  # LINE WILL CHANGE
    polarity = TextBlob(processed.get_raw()).sentiment.polarity
    scores = []
    for attr in self.packet.getAttributes():
        attrScore = 0
        for i in range(attr.get_size()):
            expectedSent = attr.get_sentiment_num(i)
            if polarity * expectedSent >= 0:
                word = attr.get_word(i)
                significance = attr.get_weight_num(i)
                attrScore += bagOfWords.count(word) * significance
        scores.append(attrScore)
    # Pad the score list out to five attributes.
    for i in range(len(scores), 5):
        scores.append(0)
    return scores
def readfile(self):
    t = TextPreprocessor()
    # Gets the lists of model answers and student answers.
    # Format of model answer: {'1.1': ['word1', 'word2'], '1.2': ['word1', 'word2']}
    # Format of student answer: [['1.1', 'word1', 'word2'], ['1.1', 'word1', 'word2'],
    #                            ['1.2', 'word1', 'word2'], ['1.2', 'word1', 'word2']]
    model_answer = t.parse_file(
        "/Users/manali/Desktop/Himani_NLP/ShortAnswerGrading_v2.0/data/sent/answers",
        False)
    student_answers = t.parse_file(
        "/Users/manali/Desktop/Himani_NLP/ShortAnswerGrading_v2.0/data/sent/all",
        True)
    self.load_model()
    self.convert_to_vectors(student_answers, model_answer)
    for key in self.avg_model_answer_vector:
        model_vector = self.avg_model_answer_vector[key]  # dict lookup, not a call
        with open(
                '/Users/manali/Desktop/Himani_NLP/OutputFolderWiki/' + key + '.txt',
                'w') as write_file:
            for answer in self.avg_student_answers_vector[key]:
                # calculate_similarity returns a number; stringify before writing.
                write_file.write(
                    str(self.calculate_similarity(model_vector, answer)) + '\n')
def readfile(self):
    t = TextPreprocessor()
    # Gets the lists of model answers and student answers.
    # Format of model answer: {'1.1': ['word1', 'word2'], '1.2': ['word1', 'word2']}
    # Format of student answer: [['1.1', 'word1', 'word2'], ['1.1', 'word1', 'word2'],
    #                            ['1.2', 'word1', 'word2'], ['1.2', 'word1', 'word2']]
    model_answer = t.parse_file(
        "D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/ShortAnswerGrading_v2.0/data/sent/answers",
        False)
    student_answers = t.parse_file(
        "D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/ShortAnswerGrading_v2.0/data/sent/all",
        True)
    self.load_model()
    self.convert_to_vectors(student_answers, model_answer)
    for key in self.avg_model_answer_vector:
        model_vector = self.avg_model_answer_vector[key]  # dict lookup, not a call
        with open(
                'D:/Studies/USC/3rdSem/CSCI544/GroupProjectData/OutputFolderGoogleNews/'
                + key + '.txt', 'w') as write_file:
            for answer in self.avg_student_answers_vector[key]:
                # calculate_similarity returns a number; stringify before writing.
                write_file.write(
                    str(self.calculate_similarity(model_vector, answer)) + '\n')
def run_setup(run_import=1, run_preprocessing=1, run_vectorization=1,
              run_classification=1):
    corpus_id = SessionConfigReader.read_value(SetupRunner.corpus_id_key)
    if run_import:
        Storage.delete_session_data()
        SessionLogger.clear()
        identifier = CorpusImporter.import_docs()
        df = Storage.load_pd_frame(identifier)
        StopwordDownloaderNLTK.get_stopwords()
    else:
        df = Storage.load_pd_frame(corpus_id)
    if run_preprocessing:
        df = TextPreprocessor.preprocess_texts(df, storage_level=1,
                                               storage_name=corpus_id)
    else:
        df = Storage.load_pd_frame(corpus_id + SetupRunner.ext_preprocessed)
    if run_vectorization:
        Storage.delete_model(
            SessionConfigReader.read_value(SetupRunner.vec_model_id_key))
        Vectorizer.create_model(df)
        df = Vectorizer.vectorize(df, storage_level=1, storage_name=corpus_id)
    else:
        df = Storage.load_pd_frame(corpus_id + SetupRunner.ext_vectorized)
    if run_classification:
        Storage.delete_h5_model(
            SessionConfigReader.read_value(SetupRunner.keras_nn_model_id_key))
        df = ClassificationInterpreter.create_out_vectors(
            df, storage_level=1, storage_name=corpus_id)
        Classifier.create_model(df)
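# The flags make each stage skippable on reruns; an illustrative partial
# rerun that reuses the stored corpus and its preprocessing, redoing only
# vectorization and classification:
# run_setup(run_import=0, run_preprocessing=0,
#           run_vectorization=1, run_classification=1)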
class SemEvalData:
    '''SemEval2016_task5_subtask1'''

    def __init__(self):
        self.path_train_raw = ROOT_DIR + '/data/SemEval2016-Task5-ABSA/SB1/REST/ABSA16_Restaurants_Train_SB1_v2.xml'
        self.path_test_raw = ROOT_DIR + '/data/SemEval2016-Task5-ABSA/SB1/REST/EN_REST_SB1_TEST.xml.gold'
        self.train_file_name = 'SemEval2016_task5_subtask1_train_ready'
        self.test_file_name = 'SemEval2016_task5_subtask1_test_ready'
        self.path_train_ready = ROOT_DIR + '/data/' + self.train_file_name
        self.path_test_ready = ROOT_DIR + '/data/' + self.test_file_name
        self.dependency_tagged_sentences = ROOT_DIR + '/data/' + 'all_tagged_sentences'
        self.stanford_input_name = "inputStanNLP.txt"
        self.text_preprocessor = TextPreprocessor()

        if do_files_exist(self.path_train_ready, self.path_test_ready):
            self.ready_train = load_gzip(self.path_train_ready)
            self.ready_test = load_gzip(self.path_test_ready)
        else:
            self.load_train_and_test()

        if do_files_exist(self.dependency_tagged_sentences):
            self.ready_tagged = load_gzip(self.dependency_tagged_sentences)
            self.ready_tagged_train = self.ready_tagged[0:TRAIN_SENTENCES]
            self.ready_tagged_test = self.ready_tagged[TRAIN_SENTENCES:]
        else:
            self.prepare_tagged_sentences()
            self.ready_tagged = load_gzip(self.dependency_tagged_sentences)

    def get_all_sentences(self):
        train_sentences = self.get_train_sentences()
        test_sentences = self.get_test_sentences()
        return np.concatenate((train_sentences, test_sentences))

    def get_train_x_y_test_x_y(self, preprocessing_options=None):
        # TODO: right now a string is returned as train x;
        # change that, but add the preprocessing option.
        train_x_train_y_test_x_test_y = []
        for data in [self.ready_train, self.ready_test]:
            x, y = [], []
            for e in data.values():
                sentence = e['sentence']
                aspect_categories = []
                for opinion in e['opinions']:
                    aspect_categories.append(opinion['category'])
                # Some sentences have multiple opinions of the same type.
                # These are ignored and treated as one instance.
                aspect_categories = list(set(aspect_categories))
                x.append(sentence)
                y.append(np.array(aspect_categories))
            train_x_train_y_test_x_test_y.append(np.array(x))
            train_x_train_y_test_x_test_y.append(np.array(y))
        return train_x_train_y_test_x_test_y

    def make_multilabel_1hot_vector(self, aspect_categories):
        multiclass_label = np.zeros(12)
        for ac in aspect_categories:
            multiclass_label = np.add(multiclass_label, ASPECT_CATEGORIES[ac])
        assert_is_one_hot_vector(multiclass_label)
        return multiclass_label

    def get_y_train_and_test_multilabel(self):
        raw = self.get_train_x_y_test_x_y()
        y_train = [self.make_multilabel_1hot_vector(l) for l in raw[1]]
        y_test = [self.make_multilabel_1hot_vector(l) for l in raw[3]]
        return np.array(y_train), np.array(y_test)

    def get_x_sow_and_y_onehot(self, embedding):
        y_train, y_test = self.get_y_train_and_test_multilabel()
        raw = self.get_train_x_y_test_x_y()
        x_train = [embedding.get_SOW(s) for s in raw[0]]
        x_test = [embedding.get_SOW(s) for s in raw[2]]
        return x_train, y_train, x_test, y_test

    def get_x_embs_and_y_onehot(self, embedding, pad=True, pad_size=80):
        def get_embeddings(sentences):
            all_embeddings = []
            for s in sentences:
                embs = embedding.get_word_emb_list(s)
                if len(embs) == 0:
                    # No embeddings found for the sentence; ignore it.
                    continue
                if pad:
                    padded = pad_array(embs, pad_size)
                    all_embeddings.append(padded)
            return np.array(all_embeddings)

        raw = self.get_train_x_y_test_x_y()
        x_train = get_embeddings(raw[0])
        x_test = get_embeddings(raw[2])
        y_train, y_test = self.get_y_train_and_test_multilabel()
        return x_train, y_train, x_test, y_test

    def load_train_and_test(self):
        '''
        Creates files of this format:
        {sentence_id: {'sentence': [sentence], 'opinions': [{}, {}]}}
        By default LOWERCASING and DECONTRACTION (I've --> I have) are applied.
        '''
        removed = 0
        for file_in, file_out in [(self.path_train_raw, self.path_train_ready),
                                  (self.path_test_raw, self.path_test_ready)]:
            opinion_count = 0
            ready = {}
            reviews = xml.etree.ElementTree.parse(file_in).getroot()
            for review in reviews:
                for sentences in review:
                    for sentence in sentences:
                        ready[sentence.get("id")] = {}
                        this_id = sentence.get("id")
                        for data in sentence:
                            if data.tag == "text":
                                this_sentence = data.text
                                this_sentence = \
                                    self.text_preprocessor.do_decontraction(this_sentence)
                                this_sentence = [w.lower() for w in this_sentence]
                                ready[this_id]['sentence'] = this_sentence
                                ready[this_id]['opinions'] = []
                            if data.tag == 'Opinions':
                                for opinion in data:
                                    opinion_count += 1
                                    ready[this_id]['opinions'].append({
                                        'target': opinion.get("target"),
                                        'category': opinion.get("category"),
                                        'polarity': opinion.get("polarity"),
                                        'to': int(opinion.get("to")),
                                        'from': int(opinion.get("from"))
                                    })
                        if len(ready[this_id]['opinions']) == 0:
                            # Remove sentences with no opinions.
                            del ready[this_id]
                            removed += 1
            # Sanity check: we got as many opinion tuples as the SemEval paper says.
            assert (opinion_count == TEST_OPINIONS
                    or opinion_count == TRAIN_OPINIONS)
            dump_gzip(ready, file_out)
        self.ready_train = load_gzip(self.path_train_ready)
        self.ready_test = load_gzip(self.path_test_ready)
        # Sanity check: we got as many sentences as the SemEval paper says.
        assert (len(self.ready_train) + len(self.ready_test)
                == TEST_SENTENCES + TRAIN_SENTENCES)

    def get_test_sentences(self):
        s = self.get_sentences(self.ready_test)
        assert len(s) == TEST_SENTENCES
        return s

    def get_train_sentences(self):
        s = self.get_sentences(self.ready_train)
        assert len(s) == TRAIN_SENTENCES
        return s

    def get_sentences(self, data):
        list_dictionaries = list(data.values())
        sentences = []
        for d in list_dictionaries:
            try:
                sentences.append(d['sentence'])
            except Exception:
                pass
        return np.array(sentences)

    def prepare_file_for_Stanford_parser(self):
        '''
        The Stanford dependency parser wants a file where each sentence is on
        a new line. Remember to use the right argument when calling the
        dependency parser, otherwise full stops are used as delimiters.
        '''
        sentences = self.get_all_sentences()
        if not isinstance(sentences[0], str):
            if len(sentences[0]) == 1:
                sentences = [sen[0] for sen in sentences]
        with open(ROOT_DIR + '/data/' + self.stanford_input_name, 'w') as f:
            for s in sentences:
                f.write(s)
                f.write('\n')

    def prepare_tagged_sentences(
            self,
            path=ROOT_DIR + '/data/' + 'all_dependencies.xml',
            dependency_type="enhanced-plus-plus-dependencies"):
        '''
        Takes as input an XML file containing Stanford-annotated sentences.
        Such a file can be obtained by calling the following command:
        java edu.stanford.nlp.pipeline.StanfordCoreNLP
            -annotators tokenize,ssplit,pos,depparse -file <INPUT_FILE>
        where the input file contains all the train and test sentences.
        '''
        try:
            docs = xml.etree.ElementTree.parse(path).getroot()
        except Exception:
            self.prepare_file_for_Stanford_parser()
            raise Exception(
                "You don't have the dependency file. I made a file called "
                + self.stanford_input_name
                + " that you should feed to the Stanford "
                + "dependency tool to obtain the dependency xml file")
        for doc in docs:
            for sentences in doc:
                tagged_sentences = []
                for s in sentences:
                    sentence = []
                    for element in s:
                        if element.tag == 'tokens':
                            for token in element:
                                for thing in token:
                                    if thing.tag == 'word':
                                        sentence.append(thing.text)
                        if element.tag == 'dependencies':
                            if element.attrib['type'] == dependency_type:
                                all_dependencies = {}
                                for dependency in element:
                                    dep_type = dependency.attrib['type']
                                    for thing in dependency:
                                        if thing.tag == 'governor':
                                            governor = thing.text.lower()
                                        if thing.tag == 'dependent':
                                            dependent = thing.text.lower()
                                    all_dependencies.setdefault(governor, [])\
                                        .append(dep_type + "_" + dependent)
                                    all_dependencies.setdefault(dependent, [])\
                                        .append(dep_type + "_inv_" + governor)
                                tagged_sentences.append(
                                    (sentence, all_dependencies))
                assert len(tagged_sentences) == TRAIN_SENTENCES + TEST_SENTENCES
                dump_gzip(tagged_sentences,
                          ROOT_DIR + '/data/' + 'all_tagged_sentences')

    def split_tagged_sentences_into_train_and_test(self):
        row = load_gzip(ROOT_DIR + '/data/' + 'all_tagged_sentences')
        train = []
        test = []
        for i in range(len(row)):
            if i < TRAIN_SENTENCES:
                train.append(np.array(row[i]))
            else:
                test.append(np.array(row[i]))
        return np.array(train), np.array(test)

    def make_vocabulary(self, sentences):
        all_words = [w for sentence in sentences for w in sentence]
        return set(all_words)

    def make_normal_vocabulary(self):
        '''Set of words used.'''
        return self.make_vocabulary(self.get_all_sentences())

    def make_syntactical_vocabulary(self):
        '''
        Returns entries like [case_of, det_the, ...] for all words and all
        different syntactical usages found in the train + test data.
        '''
        s = load_gzip(ROOT_DIR + '/data/all_tagged_sentences')
        d = []
        for e in s:
            d.append(e[0])
            for ws in list(e[1].values()):
                d.append(ws)
        return self.make_vocabulary(d)

    def get_data_syntax_concatenation_sow(self, komn):
        x_test, x_train = self.get_x_train_test_syntax(komn)
        x_train = [np.array(sum(e)) for e in x_train]
        x_test = [np.array(sum(e)) for e in x_test]
        y_train, y_test = self.get_y_train_and_test_multilabel()
        return x_train, y_train, x_test, y_test

    def get_data_syntax_concatenation(self, komn):
        x_test, x_train = self.get_x_train_test_syntax(komn, pad=True)
        y_train, y_test = self.get_y_train_and_test_multilabel()
        return x_train, y_train, x_test, y_test

    def get_x_train_test_syntax(self, komn, pad=False):
        x_train, x_test = [], []
        for i in range(len(self.ready_tagged)):
            sc = komn.get_syntactic_concatenation(self.ready_tagged[i])
            if pad:
                sc = pad_array(sc, 80)
            if i < TRAIN_SENTENCES:
                x_train.append(sc)
            else:
                x_test.append(sc)
        return np.array(x_test), np.array(x_train)

    def get_syntax_setences_for_NER(self):
        pass

    def format_xml_for_NER(self, komn):
        def tag_sentences(out, sentences, sentences_with_syntax):
            for normal, syntax in zip(sentences, sentences_with_syntax):
                opinions = sentences[normal]['opinions']
                tags = {}
                for o in opinions:
                    words = o['target'].split(' ')
                    cat = o['category']
                    tags[words[0]] = 'B-' + cat
                    for w in words[1:]:
                        tags[w] = 'I-' + cat
                sent_s = syntax[0]
                assert (len(komn.get_syntactic_concatenation(syntax))
                        == len(sent_s))
                for w in sent_s:
                    if w in tags:
                        out.write(w + ' ' + tags[w] + '\n')
                    else:
                        out.write(w + ' ' + 'O\n')
                out.write('\n')

        train_out = "NER-ABSA-16_Restaurants_Train.txt"
        test_out = "NER-ABSA-16_Restaurants_Test.txt"
        with open(train_out, 'w') as out_train, open(test_out, 'w') as out_test:
            tag_sentences(out_train, self.ready_train, self.ready_tagged_train)
            tag_sentences(out_test, self.ready_test, self.ready_tagged_test)
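# make_multilabel_1hot_vector sums per-category indicator vectors, so
# ASPECT_CATEGORIES presumably maps each of the 12 SemEval-2016 restaurant
# ENTITY#ATTRIBUTE labels to a one-hot vector of length 12. A hypothetical
# reconstruction; the ordering of the labels is an assumption.
import numpy as np

_LABELS = [
    'AMBIENCE#GENERAL', 'DRINKS#PRICES', 'DRINKS#QUALITY',
    'DRINKS#STYLE_OPTIONS', 'FOOD#PRICES', 'FOOD#QUALITY',
    'FOOD#STYLE_OPTIONS', 'LOCATION#GENERAL', 'RESTAURANT#GENERAL',
    'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#PRICES', 'SERVICE#GENERAL',
]
ASPECT_CATEGORIES = {
    label: np.eye(len(_LABELS))[i] for i, label in enumerate(_LABELS)
}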
"""
with open("X.pickle", "wb") as f:
    pickle.dump(X, f)

with open("y.pickle", "wb") as f:
    pickle.dump(y, f)

# Unpickling X and y
with open("X.pickle", "rb") as f:
    X = pickle.load(f)

with open("y.pickle", "rb") as f:
    y = pickle.load(f)
"""

# Preprocessing the reviews
from TextPreprocessor import TextPreprocessor

cleaner = TextPreprocessor()
cleaned_reviews = cleaner.clean_text(X)

# Creating the TF-IDF matrix
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000, max_df=0.65, min_df=50,
                             stop_words=stopwords.words("english"))
X = vectorizer.fit_transform(cleaned_reviews).toarray()
# words = vectorizer.get_feature_names()

# Creating train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=272)
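# Illustrative continuation (not part of the original fragment): fit and
# evaluate a simple classifier on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))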
def calculate_norm(string_array, lev_matrix, dice_matrix):
    # Fill the strict upper triangle of both distance matrices. (The
    # original bound, int(iterator + 1 / 2), evaluates to `iterator`
    # anyway because of operator precedence.)
    for iterator in range(len(string_array)):
        row = string_array[iterator]
        for index in range(iterator):
            lev = Levenshtein.distance(string_array[index], row)
            lev_matrix[index][iterator] = lev
            dice = dice_coefficient(string_array[index], row)
            dice_matrix[index][iterator] = dice


if __name__ == '__main__':
    start = time.time()
    textProc = TextPreprocessor()
    l_test = np.array(['sdadadasda', 'sadadadaqweqe', 'sdadadadac'])
    m_test = np.zeros((3, 3), dtype='int32')
    m_test2 = np.zeros((3, 3), dtype='float32')
    calculate_norm(l_test, m_test, m_test2)
    l = textProc.open_and_filtr_file(lines)  # `lines` is defined elsewhere
    m_len = len(l)
    companies_array = np.asarray(l)
    lev_matrix = np.zeros((m_len, m_len), dtype='int32')
    dice_matrix = np.zeros((m_len, m_len), dtype='float32')
    # calculate_norm(companies_array, lev_matrix, dice_matrix)
    primary_clusters = create_primary_clusters(clusters)  # `clusters` is defined elsewhere
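# dice_coefficient is presumably the character-bigram Dice similarity,
# 2*|A∩B| / (|A| + |B|) over bigram sets. A minimal sketch under that
# assumption, for reference only:
def _dice_coefficient_sketch(a, b):
    bigrams_a = {a[i:i + 2] for i in range(len(a) - 1)}
    bigrams_b = {b[i:i + 2] for i in range(len(b) - 1)}
    if not bigrams_a and not bigrams_b:
        return 1.0
    return 2 * len(bigrams_a & bigrams_b) / (len(bigrams_a) + len(bigrams_b))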
class InvertedIndex:
    """ Inverted Index class. """

    def __init__(self, db):
        from TextPreprocessor import TextPreprocessor
        self.index = dict()
        self.db = db
        self.preprocessor = TextPreprocessor()
        self.output_file = './indexer/inverted_index.txt'

    def _doc_freq(self, corpus):
        from gensim import corpora
        docs = [self.preprocessor.preprocess(doc) for docid, doc in corpus]
        dictionary = corpora.Dictionary(docs)
        dfs = {}
        for key, value in dictionary.token2id.items():
            dfs[key] = dictionary.dfs[value]
        return dfs

    def doc_freq(self, corpus):
        from gensim import corpora
        import pymp
        pymp.config.nested = False
        pymp.config.thread_limit = 4
        docs = pymp.shared.list()
        with pymp.Parallel(4) as p:
            for index in p.range(0, len(corpus)):
                docs.append(self.preprocessor.preprocess(corpus[index][1]))
        dictionary = corpora.Dictionary(docs)
        dfs = {}
        for key, value in dictionary.token2id.items():
            dfs[key] = dictionary.dfs[value]
        return dfs

    def build_index(self, document, num_docs, doc_freqs):
        """
        Process a given document, save it to the DocCollection
        and update the index.
        """
        from Appearence import Appearance
        terms = self.preprocessor.preprocess(document['text'])
        # Dictionary with each term and the frequency it appears in the text.
        appearances_dict = dict()
        for term in terms:
            term_frequency = (appearances_dict[term].frequency
                              if term in appearances_dict else 0)
            idf = self.calc_idf(term, num_docs, doc_freqs[term])
            appearances_dict[term] = Appearance(document['id'],
                                                term_frequency + 1, idf)
        update_dict = {
            key: [appearance] if key not in self.index
            else self.index[key] + [appearance]
            for (key, appearance) in appearances_dict.items()
        }
        self.index.update(update_dict)
        # Add the document into the database.
        self.db.add(document)
        return document

    def load_db(self, document):
        self.db.add(document)
        return document

    def get_index(self, n):
        print(list(self.index.items())[:n])

    def load_index_from_file(self, file):
        import json
        with open(file) as f:
            self.index = json.loads(f.read())

    def save_index(self):
        import json
        from Appearence import AppearanceEncoder
        with open(self.output_file, 'w') as file:
            file.write(json.dumps(self.index, cls=AppearanceEncoder))

    def calc_idf(self, term, num_docs, doc_freq):
        import numpy as np
        # Smoothed idf: log((1 + N) / (1 + df)) + 1.
        idf = np.log((1 + num_docs) / (1 + doc_freq)) + 1
        return idf

    def lookup_query(self, query):
        """
        Returns the dictionary of terms with their corresponding Appearances.
        This is a very naive search since it will just split the terms and
        show the documents where they appear.
        """
        from operator import itemgetter
        result = {}
        for term in query.split(' '):
            if term in self.index:
                term_arr = []
                for t in self.index[term]:
                    try:
                        term_arr.append((t.docId, t.tfidf))
                    except AttributeError:
                        # Entries loaded from JSON are plain dicts.
                        term_arr.append((t['docId'], t['tfidf']))
                result.update({term: sorted(term_arr, key=itemgetter(1))})
        return result
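# A quick worked check of calc_idf's smoothed-idf formula; the counts are
# illustrative.
import numpy as np

# With 100 documents and a term appearing in 10 of them:
# idf = ln((1 + 100) / (1 + 10)) + 1 = ln(101 / 11) + 1 ≈ 3.22
num_docs, doc_freq = 100, 10
idf = np.log((1 + num_docs) / (1 + doc_freq)) + 1
print(round(float(idf), 2))  # 3.22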
def main(model_module_path, model_path, word_embedding_path,
         sentiment_embedding_path):
    """
    This script loads a model definition script and its trained model, as
    well as embedding dictionaries. It then makes predictions on arguments,
    which are currently defined inside the script (feel free to load them
    from a file).
    """
    model_module_path = Path(model_module_path)
    model_path = Path(model_path)
    model = keras.models.load_model(model_path)

    # Load embedding dictionaries.
    word_embedding_npz = np.load(word_embedding_path, allow_pickle=True)
    word_embedding_dictionary = embedder.tokens2dict(
        word_embedding_npz['tokens'])
    sentiment_embedding_npz = np.load(sentiment_embedding_path,
                                      allow_pickle=True)
    sentiment_embedding_dictionary = embedder.tokens2dict(
        sentiment_embedding_npz['tokens'])

    model_module = importlib.import_module(model_module_path.stem)
    # Get a function mapping from data to model.
    if hasattr(model_module, 'Adaptor'):
        adaptor = model_module.Adaptor(
            word_embedding_dictionary=word_embedding_dictionary,
            sentiment_embedding_dictionary=sentiment_embedding_dictionary,
            inference=True)
    else:
        raise ValueError("No adaptor found!")

    # Load and prepare arguments.
    arguments = [
        ('Banana is the best fruit.',
         'They are full of potassium, which helps with cramps and is good '
         'for your health.'),
        ('Banana is the best fruit.',
         'Bananas are yellow. Yellow is bad, I hate it. Therefore bananas '
         'are horrible.'),
    ]
    premises = map(operator.itemgetter(1), arguments)
    conclusions = map(operator.itemgetter(0), arguments)

    txt_preprocessor = TextPreprocessor()
    premises = txt_preprocessor.transform(premises)
    conclusions = txt_preprocessor.transform(conclusions)

    df_premises = pd.DataFrame(premises)
    df_premises = df_premises.rename(
        {column_name: 'premise_' + column_name
         for column_name in df_premises.columns},
        axis='columns')
    df_conclusions = pd.DataFrame(conclusions)
    df_conclusions = df_conclusions.rename(
        {column_name: 'conclusion_' + column_name
         for column_name in df_conclusions.columns},
        axis='columns')
    df_arguments = pd.concat([df_premises, df_conclusions], axis='columns')

    argument_generator = map(
        adaptor,
        ml_gen.generator_DataFrames([lambda: df_arguments], batch_size=None))
    predictions = model.predict_generator(argument_generator, steps=1)
    print(predictions)
class MLClassifier:
    # Bullet-detection regexes shared by the feature builders below.
    REGEX_ALPHA_DOT = re.compile(r'^([a-z]|[α-ω]){1,4}(\.)')
    REGEX_ALPHA_PAR = re.compile(r'^([a-z]|[α-ω]){1,4}(\))')
    REGEX_ALPHA_CAP_DOT = re.compile(r'^([Α-Ω]|[A-Z]){1,4}(\.)')
    REGEX_ALPHA_CAP_PAR = re.compile(r'^([Α-Ω]|[A-Z]){1,4}(\))')
    REGEX_NUM_DOT = re.compile(r'^([0-9]){1,4}(\.)')
    REGEX_NUM_PAR = re.compile(r'^([0-9]){1,4}(\))')
    REGEX_DEP = re.compile(
        r'^([a-z]|[A-Z]|[Α-Ω]|[α-ω]|[0-9]){1,4}(\.|\))(\sΤμήμα)')

    def __init__(self, stopwords_file, organisations_file, org_ratio,
                 org_headersize):
        self.tp = TextPreprocessor(stopwords_file)
        self.fe = FeatureExtractor(organisations_file, self.tp, org_ratio,
                                   org_headersize)
        self.trie = ""
        self.weights = ""
        self.org_classifier = ""
        self.respa_classifier = ""
        self.respa_used_features = [
            'MatchedPatternsCount', 'UnqMatchedPatternsCount',
            'TotalMatchingCharacters', 'SumMatchingEntries',
            'SumMatchingEntriesLength', 'MatchingUnigrams',
            'MatchingBigrams', 'TotalMatchingRatio', 'UnqMatchingUnigrams',
            'UnqMatchingBigrams', 'AlphaBulletDot', 'AlphaBulletPar',
            'AlphaBulletCapDot', 'AlphaBulletCapPar', 'DigitBulletDot',
            'DigitBulletPar'
        ]
        self.org_used_features = [
            'BulletDepartment', 'OrgTotalMatchingCharacters',
            'OrgMatchingUnigrams', 'OrgMatchingBigrams', 'AlphaBulletDot',
            'AlphaBulletPar', 'AlphaBulletCapDot', 'AlphaBulletCapPar',
            'DigitBulletDot', 'DigitBulletPar'
        ]

    def read_org_trie_from_file(self, org_pickle_file):
        self.fe.read_org_trie_from_file(org_pickle_file)

    def read_trie_index_from_file(self, trie):
        self.trie = pickle.load(open(trie, "rb"))  # e.g. pickle/1trie.pkl

    def read_weights_from_file(self, weightfile):
        self.weights = pickle.load(open(weightfile, "rb"))  # e.g. pickle/1weights.pkl

    def org_classifier_from_file(self, filename):
        org_train_data = self.fe.extract_organisational_features_from_file(
            filename)
        org_classifier = svm.SVC(C=1, gamma='auto')
        org_classifier.fit(org_train_data[self.org_used_features].values,
                           org_train_data['Class'])
        self.org_classifier = org_classifier

    def update_org_classifier(self, oldfile, newfile, headersize):
        org_train_data = self.fe.update_organisational_features_from_file(
            oldfile, newfile, self.weights, self.trie, self.tp, headersize)
        org_classifier = svm.SVC(C=1, gamma='auto')
        org_classifier.fit(org_train_data[self.org_used_features].values,
                           org_train_data['Class'])
        self.org_classifier = org_classifier

    def respa_classifier_from_file(self, filename, headersize):
        train_data = self.fe.extract_features_from_file(
            filename, self.weights, self.trie, self.tp,
            headersize)  # e.g. ExampleFile.csv
        train_data['TotalMatchingRatio'] = train_data.apply(
            lambda row: (row.TotalMatchingCharacters / len(row.StemmedParagraph)
                         if len(row.StemmedParagraph) != 0 else 0),
            axis=1)
        respa_classifier = svm.SVC(C=1, gamma='auto')
        respa_classifier.fit(train_data[self.respa_used_features].values,
                             train_data['Class'])
        self.respa_classifier = respa_classifier

    def _paragraph_features(self, raw_paragraph, stemmed_paragraph, weights,
                            org_text, label):
        """Build the feature dictionary shared by all training/prediction
        paths for a single paragraph."""
        all_patterns = list(self.trie.search_all_patterns(stemmed_paragraph))
        # Remove unigrams that are contained in bigrams.
        subpatterns = utils.remove_unigrams_contained_in_bigrams(all_patterns)
        pattern_features = self.fe.extract_features_from_trie_patterns(
            subpatterns, weights)
        organisational_features = self.fe.extract_organisational_features(
            org_text)
        row = {
            'Class': label,
            'BulletDepartment': self.fe.regex_applies(self.REGEX_DEP,
                                                      raw_paragraph),
            'AlphaBulletDot': self.fe.regex_applies(self.REGEX_ALPHA_DOT,
                                                    raw_paragraph),
            'AlphaBulletPar': self.fe.regex_applies(self.REGEX_ALPHA_PAR,
                                                    raw_paragraph),
            'AlphaBulletCapDot': self.fe.regex_applies(self.REGEX_ALPHA_CAP_DOT,
                                                       raw_paragraph),
            'AlphaBulletCapPar': self.fe.regex_applies(self.REGEX_ALPHA_CAP_PAR,
                                                       raw_paragraph),
            'DigitBulletDot': self.fe.regex_applies(self.REGEX_NUM_DOT,
                                                    raw_paragraph),
            'DigitBulletPar': self.fe.regex_applies(self.REGEX_NUM_PAR,
                                                    raw_paragraph),
            'RawParagraph': raw_paragraph,
            'StemmedParagraph': stemmed_paragraph,
            'StemmedParagraphLength': len(stemmed_paragraph),
            'RawParagraphLength': len(raw_paragraph),
            'WordsInCapital': self.tp.get_words_in_capital(raw_paragraph),
            'FirstWordInCapitalOffset':
                self.tp.get_first_word_in_capital_offset(raw_paragraph),
        }
        for key in ('UnqMatchedPatternsCount', 'MatchedPatternsCount',
                    'TotalMatchingCharacters', 'LongestMatchingPattern',
                    'SumMatchingEntries', 'SumMatchingEntriesLength',
                    'MatchingUnigrams', 'MatchingBigrams',
                    'FirstPatternOffset', 'UnqMatchingUnigrams',
                    'UnqMatchingBigrams'):
            row[key] = pattern_features[key]
        for key in ('OrgTotalMatchingCharacters', 'OrgMatchingUnigrams',
                    'OrgMatchingBigrams'):
            row[key] = organisational_features[key]
        return row

    def predict_pdf_file(self, filename, respa_headersize, org_headersize,
                         pdf_directory, out_txt_directory):
        rows = []
        parser = Parser()
        txt = parser.get_txt(filename.replace(".pdf", ""), pdf_directory,
                             out_txt_directory)
        # e.g. /home/latex/Downloads/gsoc2018-GG-extraction-master/src/respa_feks/
        #      -- /home/latex/Desktop/respa_feks_txt/
        # Strip the Government Gazette page headers.
        txt = re.sub(
            r'([0-9]){1,4}(\s\n)ΕΦΗΜΕΡΙΣ ΤΗΣ ΚΥΒΕΡΝΗΣΕΩΣ \(ΤΕΥΧΟΣ ΠΡΩΤΟ\)(\s\n)',
            '', txt)
        articles = parser.get_articles(txt)
        for num, article in articles.items():
            articleNo = num
            article_paragraphs = parser.get_paragraphs(article)
            prev_flags = {
                'PrevAlphaBulletDot': 0, 'PrevAlphaBulletPar': 0,
                'PrevAlphaBulletCapDot': 0, 'PrevAlphaBulletCapPar': 0,
                'PrevDigitBulletDot': 0, 'PrevDigitBulletPar': 0,
            }
            # --- split paragraphs on nested alphabetic bullets, e.g. "\nαα)" ---
            trimmed_paragraphs = []
            for p in article_paragraphs:
                sublist = list(filter(
                    lambda x: len(x) > 1,
                    re.split(r'((\n)([α-ω])([α-ω])(\)))', p)))
                if len(sublist) > 1:
                    if len(sublist[0]) <= 3:
                        for x in range(0, len(sublist) - 1, 2):
                            trimmed_paragraphs.append(sublist[x] + sublist[x + 1])
                        if len(sublist) % 2 != 0:
                            trimmed_paragraphs.append(sublist[-1])
                    else:
                        trimmed_paragraphs.append(sublist[0])
                        for x in range(1, len(sublist) - 1, 2):
                            trimmed_paragraphs.append(sublist[x] + sublist[x + 1])
                        if len(sublist) % 2 == 0:
                            trimmed_paragraphs.append(sublist[-1])
                else:
                    trimmed_paragraphs.append(p)
            # --- end bullet splitting ---
            for raw_paragraph in trimmed_paragraphs:
                stemmed_paragraph = self.tp.getCleanText(raw_paragraph,
                                                         respa_headersize)
                row = self._paragraph_features(
                    raw_paragraph, stemmed_paragraph, self.weights,
                    self.tp.getCleanText(raw_paragraph, org_headersize),
                    'NoIdea')
                row['Filename'] = filename
                row['ArticleNo'] = articleNo
                row.update(prev_flags)
                rows.append(row)
                prev_flags = {
                    'PrevAlphaBulletDot': row['AlphaBulletDot'],
                    'PrevAlphaBulletPar': row['AlphaBulletPar'],
                    'PrevAlphaBulletCapDot': row['AlphaBulletCapDot'],
                    'PrevAlphaBulletCapPar': row['AlphaBulletCapPar'],
                    'PrevDigitBulletDot': row['DigitBulletDot'],
                    'PrevDigitBulletPar': row['DigitBulletPar'],
                }
        # DataFrame.append is deprecated; collect rows and build once.
        paragraphs_per_article = pd.DataFrame(rows)
        paragraphs_per_article['TotalMatchingRatio'] = \
            paragraphs_per_article.apply(
                lambda row: (row.TotalMatchingCharacters
                             / len(row.StemmedParagraph)
                             if len(row.StemmedParagraph) != 0 else 0),
                axis=1)
        respa_prediction = self.respa_classifier.predict(
            paragraphs_per_article[self.respa_used_features])
        org_prediction = self.org_classifier.predict(
            paragraphs_per_article[self.org_used_features])
        paragraphs_per_article['RespAPrediction'] = pd.Series(respa_prediction)
        paragraphs_per_article['OrgPrediction'] = pd.Series(org_prediction)
        paragraphs_per_article['Prediction'] = paragraphs_per_article.apply(
            lambda row: utils.total_prediction(row), axis=1)

        adj_rows = []
        previous_prediction = ''
        for index, row in paragraphs_per_article.iterrows():
            if (re.search(r'\b(ΠΕ|ΤΕ|ΔΕ|ΥΕ)\b', row['RawParagraph'])
                    and re.search(r'\b(Κλάδος|θέση|θέσεις|θέσεων|Υπάλληλος)\b',
                                  row['RawParagraph'], re.IGNORECASE)
                    or re.search(r'\b(θέση|θέσεις|θέσεων)\b',
                                 row['RawParagraph'], re.IGNORECASE)
                    and re.search(r'\bΚλάδος\b', row['RawParagraph'],
                                  re.IGNORECASE)):
                row['Prediction'] = 'Positions'
            elif (row['AlphaBulletDot'] == row['PrevAlphaBulletDot']
                  and row['AlphaBulletPar'] == row['PrevAlphaBulletPar']
                  and row['AlphaBulletCapDot'] == row['PrevAlphaBulletCapDot']
                  and row['AlphaBulletCapPar'] == row['PrevAlphaBulletCapPar']
                  and row['DigitBulletDot'] == row['PrevDigitBulletDot']
                  and row['DigitBulletPar'] == row['PrevDigitBulletPar']
                  and previous_prediction != ''):
                row['Prediction'] = previous_prediction
            # Very short average line length marks layout debris as irrelevant.
            lines = row['RawParagraph'].splitlines()
            total_line_len = sum(len(l) for l in lines)
            line_ratio = total_line_len / len(lines)
            if line_ratio <= 4:
                row['Prediction'] = 'Irrelevant'
            adj_rows.append(row)
            previous_prediction = row['Prediction']
        adj_predictions = pd.DataFrame(adj_rows)
        adj_predictions[['ArticleNo', 'RawParagraph', 'Prediction']].to_csv(
            filename + '.csv', sep='\t')
        return adj_predictions

    def respa_classifier_from_pdf_files(self, respa_directory, headersize1,
                                        non_respa_directory, headersize2,
                                        ratio, create_trie):
        respas_p_df = self.tp.getParagraphsFromFolder(respa_directory,
                                                      headersize1)
        respas = self.tp.getTermFrequency(
            list(respas_p_df['StemmedParagraph']))
        most_frequent_respas_stems_ordered = respas[0]
        weights = respas[1]
        non_respas_p_df = self.tp.getParagraphsFromFolder(non_respa_directory,
                                                          headersize2)
        non_respas = self.tp.getTermFrequency(
            list(non_respas_p_df['StemmedParagraph']))
        most_frequent_non_respas_stems_ordered = non_respas[0]
        num_non_respa_docs = len(non_respas_p_df.index)
        if create_trie:
            self.trie = utils.create_trie_index(
                most_frequent_non_respas_stems_ordered,
                most_frequent_respas_stems_ordered,
                num_non_respa_docs, ratio, self.tp)
        rows = []
        for label, frame in (('RespA', respas_p_df),
                             ('Non-RespA', non_respas_p_df)):
            for index, row in frame.iterrows():
                rows.append(self._paragraph_features(
                    row['RawParagraph'], row['StemmedParagraph'],
                    weights, row['StemmedParagraph'], label))
        df_train_respa = pd.DataFrame(rows)
        df_train_respa['TotalMatchingRatio'] = df_train_respa.apply(
            lambda row: row.TotalMatchingCharacters
            / len(row.StemmedParagraph),
            axis=1)
        self.respa_classifier = svm.SVC(C=1, gamma='auto')
        self.respa_classifier.fit(
            df_train_respa[self.respa_used_features].values,
            df_train_respa['Class'])
        df_train_respa.to_csv("training_file.csv", sep="\t")
        return df_train_respa

    def classifier_from_enriched_train_samples(self, oldfile, newfile,
                                               headersize1, headersize2,
                                               ratio):
        old_train_data = self.fe.read_training_file(oldfile)[
            ['Class', 'StemmedParagraph', 'RawParagraph']]
        new_train_data_respa = self.fe.extract_features_from_file(
            newfile, self.weights, self.trie, self.tp, headersize1)[
            ['Class', 'StemmedParagraph', 'RawParagraph']]
        new_train_data_non_respa = self.fe.extract_features_from_file(
            newfile, self.weights, self.trie, self.tp, headersize2)[
            ['Class', 'StemmedParagraph', 'RawParagraph']]
        merged_df = pd.concat(
            [old_train_data, new_train_data_respa, new_train_data_non_respa])
        merged_df = merged_df.reset_index(drop=True)
        isRespA = merged_df['Class'] == 'RespA'
        isNonRespA = merged_df['Class'] == 'Non-RespA'
        respas_p_df = merged_df[isRespA][['StemmedParagraph', 'RawParagraph']]
        non_respas_p_df = merged_df[isNonRespA][
            ['StemmedParagraph', 'RawParagraph']]
        respas = self.tp.getTermFrequency(
            list(respas_p_df['StemmedParagraph']))
        most_frequent_respas_stems_ordered = respas[0]
        weights = respas[1]
        non_respas = self.tp.getTermFrequency(
            list(non_respas_p_df['StemmedParagraph']))
        most_frequent_non_respas_stems_ordered = non_respas[0]
        num_non_respa_docs = len(non_respas_p_df.index)
        self.trie = utils.create_trie_index(
            most_frequent_non_respas_stems_ordered,
            most_frequent_respas_stems_ordered,
            num_non_respa_docs, ratio, self.tp)
        rows = []
        for label, frame in (('RespA', respas_p_df),
                             ('Non-RespA', non_respas_p_df)):
            for index, row in frame.iterrows():
                rows.append(self._paragraph_features(
                    row['RawParagraph'], row['StemmedParagraph'],
                    weights, row['StemmedParagraph'], label))
        df_train_respa = pd.DataFrame(rows)
        df_train_respa['TotalMatchingRatio'] = df_train_respa.apply(
            lambda row: row.TotalMatchingCharacters
            / len(row.StemmedParagraph),
            axis=1)
        self.respa_classifier = svm.SVC(C=1, gamma='auto')
        self.respa_classifier.fit(
            df_train_respa[self.respa_used_features].values,
            df_train_respa['Class'])
        return df_train_respa
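# A sketch of how MLClassifier might be wired up end to end. Every path
# below is hypothetical except the pickle names mentioned in the comments
# above (pickle/1trie.pkl, pickle/1weights.pkl, ExampleFile.csv).
# clf = MLClassifier('stopwords.txt', 'organisations.csv',
#                    org_ratio=0.8, org_headersize=50)
# clf.read_weights_from_file('pickle/1weights.pkl')
# clf.read_trie_index_from_file('pickle/1trie.pkl')
# clf.read_org_trie_from_file('pickle/org_trie.pkl')
# clf.org_classifier_from_file('org_train.csv')
# clf.respa_classifier_from_file('ExampleFile.csv', headersize=50)
# predictions = clf.predict_pdf_file('sample.pdf', 50, 50,
#                                    './pdfs/', './txts/')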