def extract_features(wordseg, ql, qr):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    feature_dict = {}
    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)
    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)
    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, "notion")
    feature_dict.update(notion_count_tfidf_hash_features)
    return feature_dict
def extract_features(wordseg, ql, qr, sent_word2vec, word_weights, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    feature_dict = {}
    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    # The "notion" variant operates on the notional-token views, matching
    # how the other extractors in this family use l_notion / r_notion.
    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, l_notion, r_notion,
                                            "notion")
    feature_dict.update(sentvec_features)
    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)
    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    return feature_dict
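# A minimal usage sketch for the combined extract_features above. Every
# object below is a hypothetical stand-in (the source does not show how the
# segmenter or the vectorizer/word2vec models are constructed); only the
# call shape is illustrated.
#
#   wordseg = WordSegmenter()                        # hypothetical segmenter
#   vector_models = load_tfidf_count_hash_models()   # hypothetical loader
#   w2v, vocab, model = load_sentvec_resources()     # hypothetical loader
#   feats = extract_features(wordseg, "how do I reset my password",
#                            "password reset steps", vector_models,
#                            w2v, vocab, model)
#   print(sorted(feats))  # lexical, tfidf/count/hash, and sentvec features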
def parse_expression(self, scope):
    """Parse an expression.

    Expressions may contain other forms as well; an expression is the
    'other' type of sentence. It must match a known sentence in the scope.

    :param scope: The scope the expression is being parsed within.
    :return: A Sentence."""
    token = next(self._token_stream)
    if isinstance(token, ValueToken):
        return Sentence([token])
    elif not isinstance(token, FirstToken):
        raise ParseError('Cannot begin a sentence with "' + repr(token) + '"')
    elif 'Define' == token.text:
        self._token_stream.push_back(token)
        return self.parse_definition(scope)
    node = Sentence([token])
    part_match = scope.new_matcher()
    if not part_match.next(token):
        self._token_stream.push_back(token)
        raise ParseError('Sentence not matched.', node)
    for token in self._token_stream:
        if isinstance(token, FirstToken):
            self._token_stream.push_back(token)
            if part_match.next():
                node.append(self.parse_expression(scope))
            elif node.ends_with_dot() and part_match.has_end():
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(token, WordToken):
            if part_match.next(token):
                node.append(token)
            elif node.ends_with_dot() and part_match.has_end():
                self._token_stream.push_back(token)
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(token, PeriodToken):
            if part_match.has_end():
                node.append(token)
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(token, ValueToken):
            if part_match.next():
                node.append(Sentence([token]))
            else:
                raise ParseError('Sentence not matched.', node)
        else:
            raise ValueError('Unknown Token Kind: {}'.format(type(token)))
    if isinstance(node[-1], Sentence) and part_match.has_end():
        return node
    raise ParseError('Sentence not matched.', node)
def get_docs_list(conn, courseid, questionid):
    """
    Parameters:
        conn: A MySQL connection.
        courseid: String.
        questionid: Integer.
    Return:
        Integer: number of references.
        Two lists: textids and docs (class Sentence); the references come
        first in both, with negative numbers as their textids.
    """
    # Get the references for the given courseid and questionid.
    try:
        get_ref_sql = "SELECT ref FROM standards WHERE courseid=%s AND questionid=%s"
        get_ref_cur = conn.cursor()
        get_ref_cur.execute(get_ref_sql, (courseid, questionid))
        refs = get_ref_cur.fetchall()
    except Exception:
        print("Error getting reference of courseid", courseid,
              "questionid", questionid)
        traceback.print_exc()
    finally:
        get_ref_cur.close()

    lang = "ch" if courseid == "201英语一" else "en"
    doc_matrix = []
    textids = []
    ref_id = -1
    for ref in refs:
        reference = Sentence(text=ref[0], language=lang)
        reference.preprocess()
        doc_matrix.append(reference)  # references come first in doc_matrix
        textids.append(ref_id)  # use negative numbers as reference textids
        ref_id -= 1

    # Get all detection texts for the given courseid and questionid.
    detections = None
    try:
        get_detection_sql = "SELECT textid, text FROM detection WHERE courseid = %s and questionid = %s"
        get_detection_cur = conn.cursor()
        if get_detection_cur.execute(get_detection_sql, (courseid, questionid)):
            detections = get_detection_cur.fetchall()
        else:
            print("No question", questionid, "of", courseid, "in DETECTION DB.")
    except Exception:
        print("Error getting text...")
        traceback.print_exc()
    finally:
        get_detection_cur.close()

    # Add all detections to doc_matrix.
    if detections is None:
        return
    for dt in detections:
        textids.append(dt[0])
        cur_ans = Sentence(text=dt[1], language=lang)
        cur_ans.preprocess()
        doc_matrix.append(cur_ans)
    return len(refs), textids, doc_matrix
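# Usage sketch for get_docs_list above. The driver is an assumption (any
# DB-API MySQL client with %s placeholders would do, e.g. pymysql); the
# course and question values are made up for illustration.
#
#   import pymysql
#   conn = pymysql.connect(host="localhost", user="grader",
#                          password="...", db="grading")
#   n_refs, textids, docs = get_docs_list(conn, "201英语一", 3)
#   # textids[:n_refs] hold the negative reference ids; the rest are answers.
#   conn.close()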
def parse_definition(self, outer_scope):
    """Parse a definition from the incoming tokens.

    This is technically a kind of expression, but there are a few special
    rules that may force it to become separate. This is temporary as it is
    just the fastest way I can get this to work. I hope.

    'Define Function or variable name. to be Body. .'"""
    token = next(self._token_stream)
    if 'Define' != token.text:
        raise ParseError('Invalid start of definition: ' + str(token))
    node = Sentence([token])
    ptr = outer_scope.new_matcher()
    if not ptr.next(token):
        self._token_stream.push_back(token)
        raise ParseError('Sentence not matched.', node)
    inner_scope = None
    for item in self._token_stream:
        if isinstance(item, FirstToken):
            self._token_stream.push_back(item)
            if ptr.next():
                if inner_scope is None:
                    signature = self.parse_signature()
                    node.append(signature)
                    inner_scope = outer_scope.new_define_scope(signature)
                else:
                    node.append(self.parse_expression(inner_scope))
            elif node.ends_with_dot() and ptr.has_end():
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(item, WordToken):
            if ptr.next(item):
                node.append(item)
            elif node.ends_with_dot() and ptr.has_end():
                self._token_stream.push_back(item)
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(item, PeriodToken):
            if ptr.has_end():
                node.append(item)
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(item, ValueToken):
            if ptr.next():
                node.append(Sentence([item]))
            else:
                raise ParseError('Sentence not matched.', node)
        else:
            raise TypeError('Parser.parse_definition: Unexpected type ' +
                            str(type(item)))
    if node.ends_with_dot() and ptr.has_end():
        return node
    raise ParseError('Sentence not matched.', node)
def __iter__(self):
    with open(self.fr_file) as fp_fr, open(self.en_file) as fp_en:
        for i, (line_fr, line_en) in enumerate(zip(fp_fr, fp_en)):
            if self.max_line and i > self.max_line:
                break
            fr = Sentence(line_fr.strip(), 'fr', i, self.name)
            en = Sentence(line_en.strip(), 'en', i, self.name)
            yield fr, en
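# A self-contained sketch of the pairing pattern used by __iter__ above:
# zip two line-aligned streams and stop after a maximum number of pairs.
# StringIO stands in for the fr/en files, and the project's Sentence
# wrapper is omitted since its constructor is defined elsewhere.
from io import StringIO

def paired_lines(fp_a, fp_b, max_line=None):
    for i, (a, b) in enumerate(zip(fp_a, fp_b)):
        if max_line is not None and i >= max_line:
            break
        yield a.strip(), b.strip()

fr = StringIO("Bonjour.\nMerci.\nAu revoir.\n")
en = StringIO("Hello.\nThank you.\nGoodbye.\n")
for pair in paired_lines(fr, en, max_line=2):
    print(pair)  # ('Bonjour.', 'Hello.') then ('Merci.', 'Thank you.')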
def read_sentence(self, filename):
    sentence = Sentence()
    for line in open(filename):
        line = line.strip()
        if line:
            sentence.add_token(line)
        elif len(sentence) != 1:
            yield sentence
            sentence = Sentence()
    # Flush the last sentence when the file does not end with a blank line.
    if len(sentence) != 1:
        yield sentence
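# Standalone illustration of the grouping logic in read_sentence: tokens
# accumulate until a blank line, then the block is emitted. A plain list
# collects tokens here instead of the project's Sentence class.
from io import StringIO

def read_blocks(fp):
    block = []
    for line in fp:
        line = line.strip()
        if line:
            block.append(line)
        elif block:
            yield block
            block = []
    if block:  # flush a trailing block with no final blank line
        yield block

sample = StringIO("The\nend\n\nSecond\nsentence\n")
print(list(read_blocks(sample)))  # [['The', 'end'], ['Second', 'sentence']]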
def test_ends_with_dot_period(self):
    has_period = Sentence(
        [FirstToken('Short'), WordToken('sentence'), PeriodToken()])
    self.assertTrue(has_period.ends_with_dot())
    no_period = Sentence([FirstToken('Word')])
    self.assertFalse(no_period.ends_with_dot())
    super_has_period = Sentence([FirstToken('Run'), has_period])
    self.assertTrue(super_has_period.ends_with_dot())
    super_no_period = Sentence([FirstToken('Run'), no_period])
    self.assertFalse(super_no_period.ends_with_dot())
def get_docs_list(conn, courseid, questionid):
    """
    Parameters:
        conn: A MySQL connection.
        courseid: String.
        questionid: Integer.
    Return:
        Two lists: textids and docs (class Sentence); the first entries
        are -1 and the reference Sentence.
    """
    # Get the reference for the given courseid and questionid.
    try:
        get_ref_sql = "SELECT ref FROM standards WHERE courseid=%s AND questionid=%s"
        get_ref_cur = conn.cursor()
        get_ref_cur.execute(get_ref_sql, (courseid, questionid))
        ref = get_ref_cur.fetchone()[0]
    except Exception:
        print("Error getting reference of courseid", courseid,
              "questionid", questionid)
        traceback.print_exc()
    finally:
        get_ref_cur.close()

    reference = Sentence(text=ref, language="ch")
    reference.preprocess()
    doc_matrix = [reference]  # the reference is the first term of doc_matrix
    textids = [-1]  # use -1 as the reference textid

    # Get all detection texts for the given courseid and questionid.
    detections = None
    try:
        get_detection_sql = "SELECT textid, text FROM detection WHERE courseid = %s and questionid = %s"
        get_detection_cur = conn.cursor()
        if get_detection_cur.execute(get_detection_sql, (courseid, questionid)):
            detections = get_detection_cur.fetchall()
        else:
            print("No question", questionid, "of", courseid, "in DETECTION DB.")
    except Exception:
        print("Error getting text...")
        traceback.print_exc()
    finally:
        get_detection_cur.close()

    # Add all detections to doc_matrix.
    if detections is None:
        return
    for dt in detections:
        textids.append(dt[0])
        cur_ans = Sentence(text=dt[1], language="ch")
        cur_ans.preprocess()
        doc_matrix.append(cur_ans)
    return textids, doc_matrix
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model,
                     ner_dict, syn_dict):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)
    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    # ------------------------------------------------------------------
    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)
    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)
    mt_features = calc_mt_features(sent_l, sent_r)
    feature_dict.update(mt_features)
    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, signature="notion")
    feature_dict.update(notion_count_tfidf_hash_features)
    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, l_notion, r_notion,
                                            signature="notion")
    feature_dict.update(sentvec_features)
    ner_features = get_ner_features(sent_l, sent_r, ner_dict, syn_dict)
    feature_dict.update(ner_features)
    return feature_dict
def into_sentence(prefix: list[int], conn_dict: dict[int, tp.Iterable[str]],
                  var_amount: int, var_type: str, sess) -> Sentence:
    s = Sentence([], sess)
    variables = []
    for _ in range(var_amount):
        t = s.generate(var_type)
        s.append(t)
        variables.append(t)
    s = Sentence([], sess)
    _into_sentence(s, prefix, conn_dict, variables)
    return s
def parse(self):
    current_drill_section = None
    previous_line = None
    for current in self._labels(self._labelfile_path):
        logging.info(current)
        if not current_drill_section or \
                current_drill_section.get_name() != current.section:
            current_drill_section = DrillSection(current.section)
            self._drill_sections.append(current_drill_section)
        if not previous_line:
            previous_line = current
            continue
        assert previous_line.section == current.section
        if previous_line.drill != current.drill:
            example_sentence = Sentence(previous_line.text,
                                        self._audiofile_path,
                                        previous_line.start_s,
                                        previous_line.end_s)
            current_drill_section.set_example(example_sentence)
            previous_line = current
            continue
        assert previous_line.section == current.section
        assert previous_line.drill == current.drill
        teacher_sentence = Sentence(previous_line.text, self._audiofile_path,
                                    previous_line.start_s, previous_line.end_s)
        previous_line = None
        student_sentence = Sentence(current.text, self._audiofile_path,
                                    current.start_s, current.end_s)
        new_drill = Drill(teacher_sentence, student_sentence)
        current_drill_section.add_drill(new_drill)
def test_equal_operator(self):
    left = Sentence([FirstToken('Height'), WordToken('of'),
                     WordToken('box'), PeriodToken()])
    right = Sentence([FirstToken('Height'), WordToken('of'),
                      WordToken('box'), PeriodToken()])
    self.assertTrue(left == right)
def setUp(self):
    sentence_1 = Sentence('今すぐ泳ぎなさい', '')
    self.sc_1 = SentenceClassifier(sentence_1)
    sentence_2 = Sentence('魚雷を発射するとよい', '')
    self.sc_2 = SentenceClassifier(sentence_2)
    sentence_3 = Sentence('島風を育ててはいかがでしょう', '')
    self.sc_3 = SentenceClassifier(sentence_3)
    sentence_4 = Sentence('はい、大丈夫です!', '')
    self.sc_4 = SentenceClassifier(sentence_4)
    sentence_5 = Sentence('浴室は使用前に一度、熱湯消毒をしましょう!', '')
    self.sc_5 = SentenceClassifier(sentence_5)
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    # Alternative tagger, kept for reference:
    # tagger = RnnTagger()
    # tagger.tag(text.lstrip().rstrip())
    # tokenized = tagger.tokens
    # pos_tagged = tagger.pos_tag
    # lemmatized = tagger.lemmas
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]
    lemmatized = [lemmatizer.lemmatize(tokenized[i])
                  if not pos_tagged[i].startswith("V")
                  else lemmatizer.lemmatize(tokenized[i], pos='v')
                  for i in range(len(tokenized))]
    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i + 1, tokenized[i], lemmatized[i], pos_tagged[i], index)
        for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]
    sentence = Sentence(syn_type=None, elements=elements)
    instance = CoNLL09Example(sentence, elements)
    return instance
def summarize(self, document_path):
    sentences = {}
    counter = 0
    with open(document_path, 'r') as f:
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation,
                                    Summarizer.stopwords, Summarizer.stemmer)
                    sentences[sent] = (sent.tfidf(self.tf, self.df,
                                                  Summarizer.NUM_DOCS),
                                       counter)
                    sentence = []
                    counter += 1
    totalWords = 0
    selected = []
    already_included = set()
    # Use the tf-idf score to sort the sentences.
    for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
        if sent not in already_included:  # no duplicates
            already_included.add(sent)
            selected.append(sent)
            totalWords += sent.getLength()
            if totalWords > 100:
                break
    # Return the selected sentences in their order of appearance in the document.
    return sorted(selected, key=lambda x: sentences[x][1])
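# A minimal, runnable sketch of the sentence-splitting loop inside
# summarize: words accumulate until one matches a terminator pattern.
# The two regexes stand in for Summarizer.non_space and
# Summarizer.sentence_terminator, whose exact definitions are not shown
# in the source.
import re

non_space = re.compile(r'\S+')
sentence_terminator = re.compile(r'[.!?]$')

def split_sentences(line):
    sentences, current = [], []
    for w in non_space.findall(line):
        current.append(w)
        if sentence_terminator.search(w):
            sentences.append(current)
            current = []
    return sentences

print(split_sentences('First one. Second one! Unfinished'))
# [['First', 'one.'], ['Second', 'one!']]  (the trailing fragment is dropped)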
def load_stringtext(self, textString, format_list, comment_sign):
    lines = textString.splitlines()
    column_list = {}
    for field in format_list:
        if not field.isdigit():
            column_list[field] = []
    length = len(format_list)
    for line in lines:
        entity = line.split()
        if len(entity) == length and entity[0] != comment_sign:
            for i in range(length):
                if not format_list[i].isdigit():
                    # Append the field directly; on Python 3 the former
                    # str(entity[i].encode('utf-8')) produced "b'...'" strings.
                    column_list[format_list[i]].append(entity[i])
        else:
            if not format_list[0].isdigit() and column_list[format_list[0]] != []:
                sent = Sentence(column_list, format_list, self.fgen)
                self.data_list.append(sent)
                column_list = {}
                for field in format_list:
                    if not field.isdigit():
                        column_list[field] = []
    # Flush the final sentence when the text does not end in a blank line.
    if not format_list[0].isdigit() and column_list[format_list[0]] != []:
        sent = Sentence(column_list, format_list, self.fgen)
        self.data_list.append(sent)
def read_spmrl_conll_file(spmrl_conll_filename):
    """
    Read a SPMRL .conll file and return a list of sentences.

    The input file is a SPMRL file converted to CoNLL format by the
    convert_mst.py script.

    Legacy code (try to use read_conll_file instead).
    """
    f = codecs.open(spmrl_conll_filename)
    lines = f.readlines()
    f.close()
    sentences = []
    tokens = []
    lemmas = []
    poses = []
    labels = []
    parents = []
    for line in lines:
        if line.strip() == '':
            s = Sentence(tokens, poses, labels, parents)
            s.set_lemmas(lemmas)
            sentences.append(s)
            tokens = []
            lemmas = []
            poses = []
            labels = []
            parents = []
        else:
            splt = line.strip().split()
            tokens.append(splt[1])
            lemmas.append(splt[2])
            poses.append(splt[4])  # use pos and not cpos
            labels.append(splt[7])
            parents.append(int(splt[6]))
    return sentences
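# Runnable illustration of the column layout read_spmrl_conll_file expects
# (CoNLL-style: id, form, lemma, cpos, pos, feats, head, deprel, ...). The
# tiny two-token example is made up, and a dict replaces the project's
# Sentence class.
sample = """1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse
2\tworld\tworld\tNOUN\tNN\t_\t0\troot

"""

sentences = []
tokens, lemmas, poses, labels, parents = [], [], [], [], []
for line in sample.splitlines():
    if line.strip() == '':
        if tokens:
            sentences.append(dict(tokens=tokens, lemmas=lemmas, poses=poses,
                                  labels=labels, parents=parents))
            tokens, lemmas, poses, labels, parents = [], [], [], [], []
    else:
        splt = line.strip().split()
        tokens.append(splt[1])
        lemmas.append(splt[2])
        poses.append(splt[4])
        labels.append(splt[7])
        parents.append(int(splt[6]))

print(sentences[0]['tokens'], sentences[0]['parents'])  # ['Hello', 'world'] [2, 0]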
def parse_signature(self):
    """Take a stream of tokens and create a Signature.

    Signatures have stricter rules than other parts of the language, but
    they are context insensitive and don't have to match definitions.
    This will use tokens from the stream, but may not empty the stream.

    :return: A Sentence representing the signature."""
    token = next(self._token_stream)
    if not isinstance(token, FirstToken):
        raise ParseError('Invalid start of Signature: ' + str(token))
    node = Sentence([token])
    for token in self._token_stream:
        if isinstance(token, FirstToken):
            self._token_stream.push_back(token)
            node.append(self.parse_signature())
        elif isinstance(token, WordToken):
            node.append(token)
        elif isinstance(token, PeriodToken):
            node.append(token)
            return node
        elif isinstance(token, ValueToken):
            raise ParseError('Parser.parse_signature: ValueToken not '
                             'allowed in signature.')
        else:
            raise ValueError('Unknown Token Kind: {}'.format(type(token)))
    raise ParseError('Parser.parse_signature: fell out of the loop.')
def sentence_tokenize(self, review_text):
    return [Sentence(sent, review=self)
            for sent in Review.SENT_TOKENIZER.tokenize(review_text)]
def train(self, train_path, ftype):
    """Train the NER model.

    Args:
        train_path: str - The path of the training set.
        ftype: str - Indicating the feature type.
    """
    self._nerdic = NERDic(train_path)
    io = self._io
    sentences = []
    # Read the training set.
    for words, poss, labels in io.read_sentences(train_path):
        sentences.append(Sentence(labels, words, poss, self._nerdic))
    feats, labels = self._prepare_feats(sentences, ftype)
    # Build one binary label vector per NER class (one-vs-rest).
    sep_labels = []
    for i in sentence.REVERSE_LABELS.keys():
        sep_labels.append([])
        for label in labels:
            if label == i:
                sep_labels[i].append(1)
            else:
                sep_labels[i].append(0)
    print('Start first phase training...')
    for i, learner in enumerate(self._learners):
        learner.train(feats, sep_labels[i])
def form_sentence(lines, word_alpha, char_alpha, tag_alpha,
                  symbolic_root=False, symbolic_end=False):
    words = []
    word_ids = []
    seq_chars = []
    seq_char_ids = []
    tags = []
    tag_ids = []
    edu_ids = []
    if symbolic_root:
        words.append(ROOT)
        word_ids.append(word_alpha.get_index(ROOT))
        seq_chars.append([ROOT_CHAR])
        seq_char_ids.append([char_alpha.get_index(ROOT_CHAR)])
        tags.append(ROOT_POS)
        tag_ids.append(tag_alpha.get_index(ROOT_POS))
    for line in lines:
        chars = []
        char_ids = []
        data = line.strip().split('\t')
        word = DIGIT_RE.sub(b"0", data[2])
        word_id = word_alpha.get_index(word)
        for c in word:  # iterate the characters of the current word
            chars.append(c)
            char_ids.append(char_alpha.get_index(c))
        tag = '$' if data[4] == '#' else data[4]
        tag_id = tag_alpha.get_index(tag)
        edu_id = int(data[9])
        words.append(word)
        word_ids.append(word_id)
        seq_chars.append(chars)
        seq_char_ids.append(char_ids)
        tags.append(tag)
        tag_ids.append(tag_id)
        edu_ids.append(edu_id)
    if symbolic_end:
        words.append(END)
        word_ids.append(word_alpha.get_index(END))
        seq_chars.append([END_CHAR])
        seq_char_ids.append([char_alpha.get_index(END_CHAR)])
        tags.append(END_POS)
        tag_ids.append(tag_alpha.get_index(END_POS))
    return Sentence(words, seq_chars, tags, word_ids, seq_char_ids,
                    tag_ids, edu_ids)
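# The line format form_sentence consumes is not documented in the source;
# from the indexing above it is a tab-separated row where data[2] is the
# word (after digit normalization), data[4] the POS tag ('#' maps to '$'),
# and data[9] the EDU id. A made-up row, parsed the same way:
row = "0\t0\tHello\t_\tUH\t_\t_\t_\t_\t7"
data = row.strip().split('\t')
word, tag, edu = data[2], ('$' if data[4] == '#' else data[4]), int(data[9])
print(word, tag, edu)  # Hello UH 7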
def make_data_instance(text, index, get_offsets=False):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    # offsets to rebuild things
    offsets = []
    offset = 0
    for token in tokenized:
        offset = text.find(token, offset)
        offsets.append([offset, offset + len(token)])
        offset += len(token)
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]
    lemmatized = [lemmatizer.lemmatize(tokenized[i])
                  if not pos_tagged[i].startswith("V")
                  else lemmatizer.lemmatize(tokenized[i], pos='v')
                  for i in range(len(tokenized))]
    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i + 1, tokenized[i], lemmatized[i], pos_tagged[i], index)
        for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]
    sentence = Sentence(syn_type=None, elements=elements)
    if hasattr(sentence, 'tokens'):
        instance = CoNLL09Example(sentence, elements)
    else:
        instance = None
    if get_offsets:
        return instance, offsets
    return instance
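# The offset bookkeeping above can be exercised on its own: find each token
# left-to-right, advancing the search cursor so repeated tokens map to the
# right spans. A whitespace split stands in for nltk's tokenizer here.
text = "the cat saw the cat"
offsets, offset = [], 0
for token in text.split():
    offset = text.find(token, offset)
    offsets.append([offset, offset + len(token)])
    offset += len(token)
print(offsets)  # [[0, 3], [4, 7], [8, 11], [12, 15], [16, 19]]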
def load_sentences(self, mode):
    filename = self._get_sentence_filename(mode)
    self.sentences = []  # clear sentences since we will reload them
    with open(os.path.join(self.dev_path, filename)) as f:
        data = json.load(f)
        for row in data["sentences"]:
            sentence = Sentence(
                row["start_time"],
                row["end_time"],
                self.input_locale,
                row["original_text"],
            )
            if mode == SentenceIoMode.TRANSLATE:
                for translation in row["translated_sentences"]:
                    if translation["lang_code"] not in (
                            self.target_languages + [self.input_language]):
                        continue
                    sentence.translated_sentences[
                        translation["lang_code"]] = TranslatedSentence(
                            translation["lang_code"], translation["text"])
            self.sentences.append(sentence)
    # Validate sentence start_times.
    prev_sentence_start = 0
    for idx, sentence in enumerate(self.sentences):
        if sentence.start_time <= prev_sentence_start:
            raise ValueError(
                f"Sentence {idx} has an invalid start time (it starts too soon)")
        prev_sentence_start = sentence.start_time
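# load_sentences above implies a JSON schema like the following. The field
# names are taken from the code; the values are invented for illustration.
import json

sample = json.loads("""
{"sentences": [
  {"start_time": 1.0, "end_time": 2.5, "original_text": "Hello.",
   "translated_sentences": [{"lang_code": "fr", "text": "Bonjour."}]}
]}
""")
row = sample["sentences"][0]
print(row["start_time"], row["original_text"],
      row["translated_sentences"][0]["lang_code"])  # 1.0 Hello. fr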
def predict(self, test_path, output_path, ftype):
    """Predict the test set.

    Args:
        test_path: str - The path of the test set.
        output_path: str - The path of the output file.
        ftype: str - Indicating the feature type.
    Return:
        list(Sentence) - The sentences with predicted labels.
    """
    # Read the test set.
    io = self._io
    sentences = []
    for words, poss, labels in io.read_sentences(test_path):
        sentences.append(Sentence(labels, words, poss, self._nerdic))
    for sent in sentences:
        feats, labels = self._prepare_feats([sent], ftype)
        confidence = []
        predict_ids = []
        for learner in self._learners:
            confidence.append(learner.confidence(feats))
        confidence = np.array(confidence)
        confidence = confidence.transpose()
        for con in confidence:
            predict_ids.append(np.argmax(con))
        # predict_ids = self._second_learner.predict(confidence)
        sent.add_predict(predict_ids)
    io.write_sentences(output_path, sentences)
def main(argv):
    """Compute the sentence frequency of each term."""
    # How many sentences does each word appear in?
    lexicon = defaultdict(lambda: set())
    for arg in argv:
        with open(arg, 'r') as fin:
            sentences = list()
            for line in fin:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation,
                                        Summarizer.stopwords,
                                        Summarizer.stemmer)
                        sentences.append(sent)
                        sentence = []
            for sent in sentences:
                for w in sent.stemmed:
                    lexicon[w].add(sent)  # set() will de-duplicate
    sf = {}
    for w in lexicon:
        sf[w] = len(lexicon[w])
    with open('sf.dat', 'wb') as out:
        pickle.dump(sf, out)
def __call__(self, *coms: Union[list, Sentence, int, Callable]) -> None:
    """
    Used for manipulating the history. Possible arguments:

        - `Sentence` - adds the formula to the history
        - `list` - wraps the list in a Sentence and adds it
        - `Callable` - applies `callable(history)` to the history object
          and overwrites the history with the result; treat it as a `set`
        - `int` - performs one of the predefined operations:
            - 0 - no-op
            - -1 - clears the history

    :raises TypeError: The type is not supported
    """
    for num, command in enumerate(coms):
        if isinstance(command, Sentence):
            self.add_sentence(command)
        elif isinstance(command, list):
            self.add_sentence(Sentence(command))
        elif callable(command):
            # Rebinding `self` would not affect the caller's object, so
            # replace the contents in place (History is set-like per the
            # docstring; update() is assumed to be available).
            new_history = History(command(self))
            self.clear()
            self.update(new_history)
        elif isinstance(command, int):
            if command == -1:
                # Clear the set
                self.clear()
            elif command == 0:
                # No-op
                pass
        else:
            raise TypeError(
                f"History does not accept type {type(command).__name__} "
                f"(command {num + 1})")
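# Usage sketch for History.__call__ above. History, Sentence, and their
# constructors are defined elsewhere in the project, so this stays
# illustrative rather than runnable:
#
#   history = History()
#   history(Sentence([...]))                          # add a formula
#   history(lambda h: {s for s in h if len(s) > 1})   # rewrite the whole set
#   history(0)                                        # no-op
#   history(-1)                                       # clear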
def parse_operator(self, scope):
    """Parse an operator from the stream and create an operator sentence.

    This one does not enforce the shape of operator sentences. It is just
    a bit too complex to do cleanly, so it happens on the define.

    :return: A Sentence; it may be an operator sentence or just an
        expression."""
    node = Sentence()
    part_match = scope.new_matcher()
    for token in self._token_stream:
        if isinstance(token, OperToken):
            if part_match.next(token):
                node.append(token)
            elif part_match.end():
                self._token_stream.push_back(token)
                return node
            else:
                raise ParseError('Sentence not matched.', node)
        elif isinstance(token, FirstToken):
            self._token_stream.push_back(token)
            if part_match.next():
                node.append(self.parse_expression(scope))
            elif part_match.end():
                return node if 1 < len(node) else node[0]
            else:
                raise ParseError('Sentence not matched.', node)
        else:
            self._token_stream.push_back(token)
            if part_match.end():
                return node
            else:
                raise ParseError('Sentence not matched.', node)
    raise ParseError('Operator not closed')