import re

import nltk
from nltk.stem.porter import PorterStemmer

# Assumption: `porter` is a module-level stemmer; `Sentence` is defined
# elsewhere in the project.
porter = PorterStemmer()

# Tokens dropped from stemmed sentences: punctuation and possessive markers.
PUNCT_TOKENS = {'.', '`', ',', '_', ';', '(', ')', '?', "'", '!', '"', '``',
                '--', ':', "''", "'s"}


def processFile(self, file_path_and_name):
    try:
        with open(file_path_and_name, 'r') as f:
            text_0 = f.read()

        # Keep only the body between <TEXT> tags, then strip the SGML markup.
        text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
        text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
        text_1 = re.sub("\n</TEXT>", "", text_1)
        text_1 = re.sub("<P>", "", text_1)
        text_1 = re.sub("</P>", "", text_1)
        text_1 = re.sub("\n", " ", text_1)

        # Normalize quoting conventions and newswire artifacts.
        text_1 = re.sub("''", '"', text_1)
        text_1 = re.sub("``", '"', text_1)
        text_1 = re.sub(" +", " ", text_1)
        text_1 = re.sub(" _ ", "", text_1)
        text_1 = re.sub(r"\(AP\) _", " ", text_1)
        text_1 = re.sub(r"&\w+;", " ", text_1)  # drop HTML entities

        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        lines = sent_tokenizer.tokenize(text_1.strip())

        # Trim the dateline prefix from the first sentence, if present.
        index = lines[0].find("--")
        if index != -1:
            lines[0] = lines[0][index + 2:]
        index = lines[0].find(" _ ")
        if index != -1:
            lines[0] = lines[0][index + 3:]

        sentences = []
        for sent in lines:
            sent = sent.strip()
            OG_sent = sent[:]
            sent = sent.lower()
            line = nltk.word_tokenize(sent)

            stemmed_sentence = [porter.stem(word) for word in line]
            stemmed_sentence = [x for x in stemmed_sentence
                                if x not in PUNCT_TOKENS and '&' not in x]
            # stemmed_sentence = [word for word in stemmed_sentence
            #                     if word not in stopwords.words('english')]

            # Skip very short sentences; they rarely carry summary content.
            if len(stemmed_sentence) <= 4:
                continue
            sentences.append(Sentence(file_path_and_name, stemmed_sentence, OG_sent))
        return sentences

    except IOError:
        print('Oops! File not found:', file_path_and_name)
        return [Sentence(file_path_and_name, [], [])]
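# A minimal usage sketch (hypothetical): assuming `processFile` belongs to a
# summarizer class and the input is a DUC/TAC-style SGML document with a
# <TEXT> body, the call below would yield one Sentence per usable sentence.
# The class name and file path are illustrative, not from the original code.
#
#     summarizer = Summarizer()
#     doc_sentences = summarizer.processFile("docs/d30001t/APW19981001.0001")
#     for s in doc_sentences[:3]:
#         print(s)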
def _read_file(src_file):
    """Parse a CoNLL-style file into Sentence objects, counting lemmas."""
    data = []
    lemma_count = {}
    total_samples = 0

    with open(src_file, "rt", encoding="utf-8") as src:
        # Per-sentence accumulators.
        words = []
        lemma_words = []
        is_prep = []
        tree = []
        for row in src:
            if row == "\n":
                # A blank line ends the current sentence.
                data.append(Sentence(words, lemma_words, is_prep, tree))
                words = []
                lemma_words = []
                is_prep = []
                tree = []
                continue
            # One token per row, ten whitespace-separated columns.
            tree_id, word, lemma, _, pos, _, parent_idx, context, _, _ = row.split()
            total_samples += 1
            lemma_count[lemma] = lemma_count.get(lemma, 0) + 1

            words.append(word)
            lemma_words.append(lemma)
            is_prep.append(pos in PREP)
            # Store the zero-based parent index and the dependency label.
            tree.append((int(parent_idx) - 1, context))

    return total_samples, lemma_count, data
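# Expected input format (an assumption inferred from row.split() above): a
# CoNLL-like file with ten whitespace-separated columns per token and a blank
# line between sentences, e.g.:
#
#     1   The   the   _   DT    _   2   det     _   _
#     2   cat   cat   _   NN    _   3   nsubj   _   _
#     3   sat   sit   _   VBD   _   0   root    _   _
#     4   on    on    _   IN    _   3   prep    _   _
#
# Column 5 is the POS tag checked against PREP, column 7 the 1-based parent
# index (converted to 0-based, so the root becomes -1), column 8 the label.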
import inspect
from json import loads

import processors  # module whose classes implement the individual processors


def run_processors(self):
    results_target = []
    results_reference = []
    sentences_target = []
    sentences_reference = []
    selected_names = loads(self.config.get('Processors', 'processors'))

    # Map every member of the processors module by name, then keep only the
    # ones selected in the config, preserving the configured order.
    existing_processors = dict(inspect.getmembers(processors))
    selected_processors = [(name, existing_processors[name]) for name in selected_names]

    processors_with_output = []
    for name, my_class in selected_processors:
        instance = my_class()

        # A processor listed under 'from_file' loads cached results instead
        # of recomputing them.
        from_file = False
        if self.config.has_option('Processors', 'from_file'):
            if instance.__class__.__name__ in loads(self.config.get('Processors', 'from_file')):
                from_file = True

        print('Running ' + instance.get_name())
        instance.run(self.config, from_file=from_file)
        print('Getting ' + instance.get_name())
        instance.get(self.config, from_file=from_file)
        print(instance.get_name() + ' finished!')

        if instance.get_output() is not None:
            processors_with_output.append((name, my_class))
            results_target.append(instance.get_result_tgt())
            results_reference.append(instance.get_result_ref())

    # Regroup the results per sentence: one Sentence object per input line,
    # carrying the output of every processor that produced one.
    for i in range(len(results_target[0])):
        my_sentence_tgt = Sentence()
        my_sentence_ref = Sentence()
        for k, (name, my_class) in enumerate(processors_with_output):
            instance = my_class()
            if instance.get_output() is not None:
                my_sentence_tgt.add_data(instance.get_name(), results_target[k][i])
                my_sentence_ref.add_data(instance.get_name(), results_reference[k][i])
        sentences_target.append(my_sentence_tgt)
        sentences_reference.append(my_sentence_ref)

    return [sentences_target, sentences_reference]
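# A sketch of the config section this method expects (option names come from
# the calls above; the values are illustrative). Both options hold JSON
# lists, parsed with json.loads:
#
#     [Processors]
#     processors = ["Tokenizer", "Parser", "Aligner"]
#     from_file = ["Parser"]
#
# Every name in `processors` must match a class in the `processors` module;
# classes listed in `from_file` read cached output instead of recomputing.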
def buildQuery(self, sentences, TF_IDF_w, n):
    """Build a query Sentence from the n words with the highest TF-IDF scores."""
    # TF_IDF_w maps a score to the list of words that share it; walk the
    # scores in descending order until n words have been collected.
    scores = sorted(TF_IDF_w.keys(), reverse=True)
    i = 0
    j = 0
    queryWords = []
    while i < n and j < len(scores):
        for word in TF_IDF_w[scores[j]]:
            queryWords.append(word)
            i += 1
            if i >= n:
                break
        j += 1
    return Sentence("query", queryWords, queryWords)
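# A hedged usage sketch: the shape of TF_IDF_w is inferred from the lookups
# above, and the values here are purely illustrative.
#
#     TF_IDF_w = {0.9: ["storm"], 0.7: ["coast", "wind"], 0.2: ["the"]}
#     query = self.buildQuery(sentences, TF_IDF_w, 3)
#     # -> Sentence("query", ["storm", "coast", "wind"], ["storm", "coast", "wind"])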
def get_clean_sentence(self, punct_cand, punct_ref, cand, ref):
    clean_cand = Sentence()
    clean_ref = Sentence()
    for method in sorted(cand.keys()):
        if method == 'alignments':
            alignments = self.get_clean_alignments(punct_cand, punct_ref, cand, ref)
            clean_data_cand = alignments
            clean_data_ref = alignments
        else:
            clean_data_cand = self.get_clean_data(cand[method], punct_cand)
            clean_data_ref = self.get_clean_data(ref[method], punct_ref)
        clean_cand.add_data(method, clean_data_cand)
        clean_ref.add_data(method, clean_data_ref)
    return clean_cand, clean_ref
def sentence2vec(self, sentence):
    sentence = Sentence(sentence, self.seg)
    vec_bow = self.dictionary.doc2bow(sentence.get_cuted_sentence())
    return self.model[vec_bow]
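# A minimal sketch of how sentence2vec is typically wired up, assuming
# gensim-style Dictionary/model objects (doc2bow and model[bow] are standard
# gensim calls; the surrounding setup below is illustrative, not from the
# original code):
#
#     from gensim import corpora, models
#     self.dictionary = corpora.Dictionary(cut_corpus)
#     corpus = [self.dictionary.doc2bow(tokens) for tokens in cut_corpus]
#     self.model = models.TfidfModel(corpus)
#     vec = self.sentence2vec("some input sentence")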
def set_sentences(self, sentences):
    self.sentences = []
    for i, sent in enumerate(sentences):
        self.sentences.append(Sentence(sent, self.seg, i))