def processLine(line, vob, out):
    global totalLine
    global longLine
    global totalChars
    ss = line.split("\t")
    sentence = Sentence()
    nn = len(ss)
    for i in range(nn):
        ts = ss[i].split(" ")
        ustr = unicode(ts[0].decode('utf8'))
        sentence.addToken(ustr)
    if sentence.chars > MAX_LEN:
        longLine += 1
    else:
        x = []
        y = []
        totalChars += sentence.chars
        sentence.generate_tr_line(x, y, vob)
        nn = len(x)
        assert (nn == len(y))
        for j in range(nn, MAX_LEN):
            x.append(0)
            y.append(0)
        line = ''
        for i in range(MAX_LEN):
            if i > 0:
                line += " "
            line += str(x[i])
        for j in range(MAX_LEN):
            line += " " + str(y[j])
        out.write("%s\n" % (line))
    totalLine += 1
def generate(self, sentence: Sentence, type_: str) -> token_type:
    """
    Generates a new token of the given type

    :param sentence: The sentence in which the token will be used
    :type sentence: Sentence
    :param type_: Token type
    :type type_: str
    :return: A token of the form `[type]_[lexeme]`
    :rtype: str
    """
    assert type_ in self.generator_regexes, "Type doesn't exist in this Lexicon"
    if type_ in self.find_new:
        new_lexems = generate(self.generator_regexes[type_])
        used_lexems = sentence.getLexems()
        try:
            while (new_lex := next(new_lexems)) in used_lexems:
                pass
        except StopIteration:
            raise LexError(f"Need more lexems for the {type_} type")
        else:
            return f"{type_}_{new_lex}"
    else:
        counted = Counter(
            (l for t, l in sentence.getItems() if l == type_)).items()
        try:
            new_lex = max(counted, key=lambda x: x[1])[0]
        except ValueError:
            return f"{type_}_{getone(self.generator_regexes[type_])}"
        else:
            return f"{type_}_{new_lex}"
def extract_features(wordseg, ql, qr):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    feature_dict = {}
    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)

    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, "notion")
    feature_dict.update(notion_count_tfidf_hash_features)

    for k in feature_dict:
        print(k)

    return feature_dict
def load_sentences(self, mode):
    filename = self._get_sentence_filename(mode)
    self.sentences = []  # clear sentences since we will reload it
    with open(os.path.join(self.dev_path, filename)) as f:
        data = json.load(f)
        for row in data["sentences"]:
            sentence = Sentence(
                row["start_time"],
                row["end_time"],
                self.input_locale,
                row["original_text"],
            )
            if mode == SentenceIoMode.TRANSLATE:
                for translation in row["translated_sentences"]:
                    if translation["lang_code"] not in (self.target_languages +
                                                        [self.input_language]):
                        continue
                    sentence.translated_sentences[
                        translation["lang_code"]] = TranslatedSentence(
                            translation["lang_code"], translation["text"])
            self.sentences.append(sentence)

    # validate sentence start_times
    prev_sentence_start = 0
    for idx, sentence in enumerate(self.sentences):
        if sentence.start_time <= prev_sentence_start:
            raise ValueError(
                f"Sentence {idx} has an invalid start time (it starts too soon)"
            )
        prev_sentence_start = sentence.start_time
def read_spmrl_conll_file(spmrl_conll_filename):
    """
    Read a SPMRL .conll file and return list of sentences

    The input file is a SPMRL file converted to conll format by the
    convert_mst.py script

    Legacy code (try to use read_conll_file instead)
    """
    f = codecs.open(spmrl_conll_filename)
    lines = f.readlines()
    f.close()

    sentences = []
    tokens = []
    lemmas = []
    poses = []
    labels = []
    parents = []
    for line in lines:
        if line.strip() == '':
            s = Sentence(tokens, poses, labels, parents)
            s.set_lemmas(lemmas)
            sentences.append(s)
            tokens = []
            lemmas = []
            poses = []
            labels = []
            parents = []
        else:
            splt = line.strip().split()
            tokens.append(splt[1])
            lemmas.append(splt[2])
            poses.append(splt[4])  # use pos and not cpos
            labels.append(splt[7])
            parents.append(int(splt[6]))
    return sentences
def get_original_text(conn, courseid, questionid):
    """
    Parameters:
        conn: A mysql connection
        courseid: String, '201英语一' or '202英语二'
        questionid: Integer
    Return:
        The original text of a translation question of '201英语一', wrapped in a
        Sentence instance after preprocessing. None if courseid is '202英语二'.
    """
    original_text = ""
    if courseid == '202英语二':
        return None
    try:
        get_original_sql = "SELECT original From standards WHERE courseid=%s AND questionid=%s"
        get_original_cur = conn.cursor()
        get_original_cur.execute(get_original_sql, (courseid, questionid))
        original_text = get_original_cur.fetchone()[0]
    except Exception as e:
        print("Error getting original text of courseid", courseid,
              "questionid", questionid)
        traceback.print_exc()
    finally:
        get_original_cur.close()
    # print(original_text)
    original = Sentence(text=original_text, language='en')
    original.preprocess()
    return original
def convert_spacy_format(text):
    parsed_text = []
    # instantiate Spacy's parser
    parser = English()
    # parse text via Spacy's parser
    doc = parser(unicode(text, "utf-8"))
    for sent in doc.sents:
        s = Sentence()
        s.string = str(sent)
        word_index = 0
        for token in sent:
            # problem: sometimes, spacy interprets a new line in a text-file
            # wrongly and provides an empty token.
            # solved: by the following condition
            if len(token.orth_) > 1:
                # Spacy's tags for each word in the sentence are stored in a new Word-object
                w = Word()
                w.string = token.orth_
                w.lemma = token.lemma_
                w.index = word_index
                # less verbose tags are provided by "token.pos_"
                w.tag = token.tag_
                w.entity = token.ent_type_
                word_index += 1
                # each word is appended to a Sentence-object
                s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
def pop_part(sentence: Sentence, split_type: str, sent_num: int):
    """
    Returns the n-th subsentence (split on `split_type` objects),
    removing it from the sentence
    """
    split_count = 0
    start_split = 0
    for s in sentence:
        if s.startswith(f"{split_type}_"):
            split_count += 1
        if split_count == sent_num:
            break
        start_split += 1

    if len(sentence) <= start_split or split_count < sent_num:
        raise IndexError("sent_num is too big")

    part = []
    if split_count > 0:
        sentence.pop(start_split)
    while start_split < len(sentence) and not sentence[start_split].startswith(
            f"{split_type}_"):
        part.append(sentence.pop(start_split))
    if len(sentence) > 0 and split_count == 0:
        sentence.pop(start_split)
    return part
def summarize(self, document_path):
    sentences = {}
    counter = 0
    with open(document_path, 'r') as f:
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation,
                                    Summarizer.stopwords, Summarizer.stemmer)
                    sentences[sent] = (sent.tfidf(self.tf, self.df,
                                                  Summarizer.NUM_DOCS), counter)
                    sentence = []
                    counter += 1

    totalWords = 0
    selected = []
    already_included = set()
    # Use the tf-idf score to sort the sentences
    for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
        if sent not in already_included:  # no duplicates
            already_included.add(sent)
            selected.append(sent)
            totalWords += sent.getLength()
            if totalWords > 100:
                break

    # return the selected sentences in their order of appearance in the document
    return sorted(selected, key=lambda x: sentences[x][1])
def start(self, tag, attrib):
    self.tag = tag
    if tag == 'sentences':
        self.parse_sent = True
    elif tag == 'sentence':
        if self.parse_sent:
            self.sent = Sentence(int(attrib['id']) - 1)
    elif tag == 'dependencies':
        if attrib['type'] == consts.corenlp_dependency_type \
                and self.parse_sent:
            self.parse_dep = True
            self.copied_dep = False
    elif tag == 'dep':
        if self.parse_dep:
            self.dep_label = attrib['type']
            if 'extra' in attrib:
                self.extra = True
    elif tag == 'governor':
        if self.parse_dep:
            self.gov_idx = int(attrib['idx']) - 1
            if 'copy' in attrib:
                self.copied_dep = True
    elif tag == 'dependent':
        if self.parse_dep:
            self.dep_idx = int(attrib['idx']) - 1
            if 'copy' in attrib:
                self.copied_dep = True
    elif tag == 'coreference':
        if not self.parse_coref:
            self.parse_coref = True
        self.coref = Coreference(len(self.corefs))
    elif tag == 'mention':
        if self.parse_coref:
            if 'representative' in attrib:
                self.rep = True
def extract_features(wordseg, ql, qr, sent_word2vec, word_weights, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    feature_dict = {}
    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)

    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r,
                                            "notion")
    feature_dict.update(sentvec_features)

    for k, value in feature_dict.items():
        print(k)
        print(value)

    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)

    return feature_dict
def search(sentence):
    s = Sentence(sentence)
    word_list = s.segment().filter().word_list
    found_list = findSimilarWords.by_word_list(word_list, 20)
    if len(found_list) != 0:
        return json.dumps(found_list)
    else:
        return json.dumps([])
def _consume_message(ch, method, properties, body):
    try:
        sentence = Sentence()
        sentence.from_json(body)
        _persist_sentence(sentence)
        mq_handler.ack(method)
    except Exception as e:
        logging.error("Could not consume message: " + body +
                      ".\nException:" + str(e))
def _consume_message(ch, method, properties, body):
    try:
        original_sentence = Sentence()
        original_sentence.from_json(body)
        _translate_to_all_languages(original_sentence)
        mq_handler.ack(method)
    except Exception as e:
        logging.error(
            "Could not consume message: " + body + ".\nException:" + str(e))
def test_one_word(self):
    word_expected = "sentence"
    text = "sentence."
    sentence = Sentence(text)
    (word_actual, length) = sentence.get_longest_word()
    self.assertEqual(word_actual, word_expected)
    self.assertEqual(len(word_actual), len(word_expected))
def test_ignores_extra_whitespace(self):
    longest_word_expected = "antidisestablismentarianism"
    text = "hello, my name is antidisestablismentarianism."
    sentence = Sentence(text)
    (longest_word_actual, length) = sentence.get_longest_word()
    self.assertEqual(longest_word_actual, longest_word_expected)
    self.assertEqual(len(longest_word_actual), len(longest_word_expected))
def test_select():
    expected_selected = list('以 ')
    expected_source = list('可 我想我。')
    sentence = Sentence('我想我可以。', 'I think I can.', state='可以我想我。')
    sentence.select(1)
    assert expected_selected == sentence.selected
    assert expected_source == sentence.source
def init_dataset(src_path, tgt_path, src_out, tgt_out):
    with open(src_path) as fp_in_src, open(tgt_path) as fp_in_tgt:
        with open(src_out, 'w') as fp_out_src, open(tgt_out, 'w') as fp_out_tgt:
            for src_l, tgt_l in tqdm(zip(fp_in_src, fp_in_tgt)):
                src, tgt = Sentence.deserialize(src_l), Sentence.deserialize(
                    tgt_l)
                fp_out_src.write("{}|{}\n".format(src.source, tgt.key))
                fp_out_tgt.write("{}\n".format(tgt.source))
def __init__(self, input_file, heuristic):
    if input_file is not None:
        clause_list = self.read_input(input_file)
        variable_set = set(map(abs, itertools.chain.from_iterable(clause_list)))
        self.model = Model(variable_set)
        self.sentence = Sentence(clause_list, self.model)
    self.root_stage = 0
    self.heuristic = heuristic
def __iter__(self):
    with open(self.fr_file) as fp_fr, open(self.en_file) as fp_en:
        for i, (line_fr, line_en) in enumerate(zip(fp_fr, fp_en)):
            if self.max_line and i > self.max_line:
                break
            fr = Sentence(line_fr.strip(), 'fr', i, self.name)
            en = Sentence(line_en.strip(), 'en', i, self.name)
            yield fr, en
def __init__(self, n):
    Sentence.__init__(self, n)
    self.knowledge = {}
    for line in open(ngrams_dir + str(n) + '.gram').read().splitlines():
        data = line.split(', ')
        word = data[0]
        self.knowledge[word] = []
        for i in range(1, len(data)):
            self.knowledge[word].append(data[i])
def declare(kb, input):
    while input:
        sent_str, input = Sentence.next(input)
        sent_type = Sentence.classify(sent_str)
        if sent_type == 'fact':
            fact = Facts.parse_fact(sent_str)
            kb.add_fact(fact)
        elif sent_type == 'rule':
            rule = Rules.parse_rule(sent_str)
            kb.add_rule(rule)
class TestSemantics(TestCase):
    def setUp(self):
        self.sentence = Sentence()

    def test_generate(self):
        # self.sentence.generate(['東京', 'とは'])
        pass

    def test_get_posid(self):
        self.sentence._get_posid(['東京', 'とは'])
def main(n):
    db = DB(db_params)
    result = db.query(
        'select activityDescription,noga1,noga2,noga3,noga4,noga5 from CONTACTS where noga1!="" AND '
        'activityDescription != "N/A" limit 10000')
    knowledge = {}
    for item in result:
        description = item[0].lower()
        nogas = [item[1], item[2], item[3], item[4], item[5]]
        nogas = filter(None, nogas)
        noga = str(nogas[0])

        sentence_proc = Sentence(n)
        sentence_proc.set_desc(description)
        sentence_proc.tokenize()
        sentence_proc.generate_ngrams()
        all_ngrams = sentence_proc.get_ngrams()

        for ngram in all_ngrams:
            if ngram not in knowledge:
                knowledge[ngram] = []
            knowledge[ngram].append(noga)

    for key, value in knowledge.iteritems():
        value = most_common(value, 5)
        value = ', '.join(value)
        print '{0}, {1}'.format(key, value)
def calculate_sentences(self):
    self.topic_scores_dic = {}
    self.subsentence_detail_dic = {}
    self.score_detail_dic = {}
    self.word_count_dic = {}
    for index, sentence in self.sentences_dic.items():
        s = Sentence(sentence, parser, "all_dic.csv", self.devide_topic_n,
                     self.divide_word_n)
        self.topic_scores_dic[index] = s.topic_score
        self.score_detail_dic[index] = s.model_details()
        self.subsentence_detail_dic[index] = s.subsentence_detail
def read_sentences(file, speakers):
    """Reads sentence data from file and parses it with the speaker data."""
    bulk_data = read_text(file)
    sentence_indecies = []
    sentences = []

    # Use these for index bounds of each block of sentence data.
    x = -1
    y = -1

    # Find the beginning and end indices for each sentence block.
    for idx in range(0, len(bulk_data)):
        if len(bulk_data[idx]) > 0:
            if bulk_data[idx][0] == '1':
                x = idx
        else:
            y = idx
            if x > -1 and y > -1:
                sentence_indecies.append((x, y))
                x = -1
                y = -1

    # Use the bounds of each sentence block to extract each sentence.
    # Making the assumption that while each word has a speaker ID, all sentence
    # blocks are uttered by the same person. This may not be true, but we
    # don't really care that much about the speaker data.
    for block in sentence_indecies:
        data_block = bulk_data[block[0]:block[1]]

        # Let's extract the speaker information.
        speaker_ID = data_block[0][-1]
        this_speaker = None
        for speaker in speakers:
            if speaker_ID == speaker.sid:
                this_speaker = speaker
                break

        this_word_list = []
        this_sentence_list = []
        for data in data_block:
            word_data = data.split('\t')
            this_word_list.append((word_data[1], word_data[3]))
            this_sentence_list.append(word_data[1])

        sentence_text = " ".join(this_sentence_list)
        this_sentence = Sentence(this_speaker, sentence_text, this_word_list)
        this_sentence.lem()
        sentences.append(this_sentence)

    return sentences
def index():
    guess = u''
    bag_of_words = u''
    query = u''
    if request.method == 'POST':
        query = request.form.get('q', '')
        if query != '':
            sentence = Sentence(query, 1)
            bag_of_words = sentence.bag_of_words()
            guess = classifier.classify(bag_of_words)
            guess = u'Positive' if guess == 'pos' else u'Negative'
    return render_template('index.html',
                           guess=guess,
                           query=query,
                           bag_of_words=bag_of_words)
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model, ner_dict,
                     syn_dict):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    #----------------------------------------------------------------------------------------------------------
    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    mt_features = calc_mt_features(sent_l, sent_r)
    feature_dict.update(mt_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)

    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, signature="notion")
    feature_dict.update(notion_count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, l_notion, r_notion,
                                            signature="notion")
    feature_dict.update(sentvec_features)

    ner_features = get_ner_features(sent_l, sent_r, ner_dict, syn_dict)
    feature_dict.update(ner_features)

    return feature_dict
def predict(self, test_file):
    """
    Predicts the language of every line in the given file based on the
    decision tree hypothesis.
    :param test_file: path of a file containing untagged sentences.
    :return:
    """
    test_file = open(test_file, 'r')
    for line in test_file:
        line_data = Sentence(line, False)
        line_data.tag = self.majority_result(line_data)
        print(line_data.tag)
def declare(kb, list_data):
    # initialize a knowledge database
    while list_data:
        current_line, list_data = Sentence.read_list(
            list_data)  # return a comment/fact/rule and update list
        type = Sentence.classify(current_line)
        if type == 'fact':
            fact = Fact.parse_fact(current_line)
            kb.add_fact(fact)
            kb.add_fact_predicate(fact.predicate)
        elif type == 'rule':
            rule = Rule.parse_rule(current_line)
            kb.add_rule(rule)
            kb.add_rule_conclusion_predicate(rule.conclusion.predicate)
def parse(self):
    current_drill_section = None
    previous_line = None
    for current in self._labels(self._labelfile_path):
        logging.info(current)
        if not current_drill_section or \
                current_drill_section.get_name() != current.section:
            current_drill_section = DrillSection(current.section)
            self._drill_sections.append(current_drill_section)

        if not previous_line:
            previous_line = current
            continue

        assert previous_line.section == current.section

        if previous_line.drill != current.drill:
            example_sentence = Sentence(previous_line.text,
                                        self._audiofile_path,
                                        previous_line.start_s,
                                        previous_line.end_s)
            current_drill_section.set_example(example_sentence)
            previous_line = current
            continue

        assert previous_line.section == current.section
        assert previous_line.drill == current.drill

        teacher_sentence = Sentence(previous_line.text, self._audiofile_path,
                                    previous_line.start_s, previous_line.end_s)
        previous_line = None

        student_sentence = Sentence(current.text, self._audiofile_path,
                                    current.start_s, current.end_s)

        new_drill = Drill(teacher_sentence, student_sentence)
        current_drill_section.add_drill(new_drill)
def test_equal_operator(self):
    left = Sentence([
        FirstToken('Height'), WordToken('of'), WordToken('box'),
        PeriodToken()
    ])
    right = Sentence([
        FirstToken('Height'), WordToken('of'), WordToken('box'),
        PeriodToken()
    ])
    self.assertTrue(left == right)
def compute_polarity_scores():
    for i, sentence_key in enumerate(sorted(filtered_sentences)):
        if i < beginTrain or i > endTrain:
            continue
        print
        print "==============================="
        print "sentence#: " + str(i) + " -- " + str(i - beginTrain + 1) + "/" + str(endTrain - beginTrain + 1)
        print

        sentence_dict = filtered_sentences[sentence_key]
        subsentence_keys = sorted(filter(lambda x: isinstance(x, int), sentence_dict.keys()))
        for j, subsentence_key in enumerate(subsentence_keys):
            print
            print "-------------------------------"
            print "subsentence#: " + str(j + 1) + "/" + str(len(subsentence_keys))
            print

            sentence = Sentence(sentence_dict[subsentence_key]['sentence'],
                                sentence_key, subsentence_key, foodName)
            concept_polarity, filtered_concept_list = polarity.compute_concept_polarity(foodName, sentence)
            adj_polarity = polarity.compute_adj_polarity(foodName, sentence)
            dep_polarity = polarity.compute_dep_polarity(foodName, sentence)

            business = filter(lambda x: x["business_id"] == inputs[sentence_key]["business_id"], businesses)[0]

            results.append({
                "rating": sentence_dict[subsentence_key]['rating'] if human else dep_polarity,
                "type": "manual_label" if human else "dep_polarity",
                "concept_polarity": concept_polarity,
                "adj_polarity": adj_polarity,
                "dep_polarity": dep_polarity,
                "id": i,
                "sentence": sentence.str_val,
                "sentence_key": sentence_key,
                "subsentence_key": subsentence_key,
                "business_id": inputs[sentence_key]["business_id"],
                "user_id": inputs[sentence_key]["user_id"],
                "votes": inputs[sentence_key]["votes"],
                "stars": inputs[sentence_key]["stars"],
                "lng": business["longitude"],
                "lat": business["latitude"],
                "full_address": business["full_address"],
                "name": business["name"],
                "food": foodName,
                "concepts": sentence.getConcepts(),
                "filtered_concepts": filtered_concept_list
            })

            if human:
                print "rating: " + str(sentence_dict[subsentence_key]['rating'])
            print "concept_polarity: " + str(concept_polarity)
            print "adj_polarity: " + str(adj_polarity)
            print "dep_polarity: " + str(dep_polarity)
def parse_signature(self):
    """Take a stream of tokens and create a Signature.

    Signatures have stricter rules than other parts of the language, but
    they are context insensitive and don't have to match definitions. This
    will use tokens from the stream, but may not empty the stream.

    :return: A Sentence representing the signature."""
    token = next(self._token_stream)
    if not isinstance(token, FirstToken):
        raise ParseError('Invalid start of Signature: ' + str(token))
    node = Sentence([token])
    for token in self._token_stream:
        if isinstance(token, FirstToken):
            self._token_stream.push_back(token)
            node.append(self.parse_signature())
        elif isinstance(token, WordToken):
            node.append(token)
        elif isinstance(token, PeriodToken):
            node.append(token)
            return node
        elif isinstance(token, ValueToken):
            raise ParseError('Parser.parse_signature: ValueToken not '
                             'allowed in signature.')
        else:
            raise ValueError('Unknown Token Kind: {}'.format(type(token)))
    raise ParseError('Parser.parse_signature: fell out of the loop.')
def segment_aspects(input_fname, out_fname):
    print "begin loading sentences, ......"
    with open(input_fname, "rt") as inf:
        dd = json.load(inf)
        sentences = [Sentence.from_json(d) for d in dd]
    print "{} sentences loaded".format(len(sentences))

    ######### for test and debug
    # sentences = random.sample(sentences,2000)
    ######### end test and debug

    seed_aspect_keywords = {
        "Overall": set(["recommend", "recommendation", "love", "return", "best", "regret", "rating"]),
        "Value": set(["value", "price", "quality", "worth"]),
        "Room": set(["suite", "view", "bed", "spacious", "noisy", "small"]),
        "Location": set(["location", "traffic", "minute", "parking", "restaurant", "shop", "locate", "subway", "bus", "airport", "downtown"]),
        "Cleanliness": set(["clean", "dirty", "maintain", "smell"]),
        "Service": set(["staff", "check", "help", "service", "helpful", "friendly"]),
        "Business service": set(["business", "center", "computer", "internet", "wifi", "free"])
    }

    segmenter = AspectSegmentation(sentences, seed_aspect_keywords)
    segmenter.run()

    # save results
    print "begin dumping the results, ......"
    save_segmentation_results(segmenter, out_fname)
    print "!!! DONE !!!"
def from_dict(d):
    r = Review()
    r.id = d.get("_id", None)
    r.business_id = d.get("business_id", None)  # sometimes, we just don't care business_id
    r.ratings = d.get("ratings", None)
    r.sentences = [Sentence.from_dict(sent_dict) for sent_dict in d["sentences"]]
    return r
def add_sentence(self, sen):
    # change tokens to ints
    if self._int_tokens:
        sen = self.tokens_to_ints(sen)

    # create actual Sentence instance
    new_sen = Sentence(sen)
    self._corpus.append(new_sen)

    # filter stopwords
    if hasattr(self, "_stopwords"):
        new_sen.remove_toks(self._stopwords, self._backup)

    # register to index
    sen_index = len(self._corpus) - 1
    for tok in new_sen:
        self._index[tok].add(sen_index)
def _translate_to(original_sentence, to_language):
    translation = translator.translate(
        original_sentence.get_text(), to_language,
        original_sentence.get_language())

    translated_sentence = Sentence()
    translated_sentence.set_author(original_sentence.get_author())
    translated_sentence.set_language(to_language)
    translated_sentence.set_text(translation)

    mq_handler.publish('storage', '', translated_sentence.to_json(), True)
def test_negation_suffix():
    stopwords = common.make_stop_words()
    sentences = [
        "I don't like Beijing 123, because it's too expensive",
        "I cannot 4 run away 56, since I am a grown man",
        "never ever come back again, I swear to god",
        "without any problem",
        "I don't think I will enjoy it: it might be too spicy"
    ]
    for index, raw_sent in enumerate(sentences):
        sentence = Sentence.from_raw(raw_sent, stopwords)
        print "\n=========================== [{}]".format(index + 1)
        print sentence.raw
        print sentence.words
def tokenize_merge(row):
    allwords = []
    for text in row.iloc[1:].dropna():
        text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''")
        s = Sentence.from_raw(text, StopWords, neg_mark=True)
        allwords += s.words
    print allwords  # show progress
    return allwords
def read_kb(filename):
    """
    Creates a knowledge base from file in DIMACS format.
    :param filename: File's relative address.
    :return: Sentence instance.
    """
    if not os.path.isfile(filename):
        print('ERROR: iofiles read_kb -> ' + filename + ' not found')
        exit()

    f = open(filename, 'r')

    # consume comments in the preamble. comment lines start with a 'c'
    for line in f:
        if line[0] != 'c':
            break

    # the next line in the file contains info on the number of clauses and
    # variables. the line begins with a 'p' and the format (cnf).
    # example line: 'p cnf 5 3' --> 5 variables and 3 clauses
    (nbvar, nclauses) = [int(i) for i in line.split()[2:]]
    new_kb = Sentence(nbvar)

    # each of the next lines in the file represents a clause. each line ends
    # with a '0'. example line: ' 1 -5 4 0'
    # save the clauses into an object
    for line in f:
        while line[0] == ' ':
            line = line[1:len(line)]
        if line[0] == '%':
            break
        aux_list = list()
        for variable in line.split()[:-1]:  # discard the ending '0'
            variable = int(variable)
            aux_list.append(variable)
        new_kb.add_clause(tuple(aux_list))

    f.close()
    return new_kb
def process_text(full_text, api_key):
    """
    Translates and weighs each sentence and returns a giant JSON of the
    Sentence data.
    :param full_text: text to process
    :return: JSON of text with meta data
    """
    sentences = get_sentences(full_text)
    trans_sentences = translate_ru2en(sentences, api_key)
    sentence_obj_list = []

    # translate the text fully, then get the sentences in English
    for i in xrange(len(sentences)):
        ru_text = sentences[i]
        en_text = trans_sentences[i]
        weight = get_sentence_weight(ru_text)
        new_sentence = Sentence(ru_text, en_text, weight)
        sentence_obj_list.append(new_sentence.get_json())

    return json.dumps(sentence_obj_list)
def serialize_sentences(input_file):
    f_in = open(input_file, "rb")
    sentences = []
    context = etree.iterparse(f_in, tag="S")
    for event, sentence_elem in context:
        sent_obj = Sentence()
        for word_elem in sentence_elem.iter("W"):
            idx = word_elem.attrib["ID"]
            lemma = word_elem.attrib["LEMMA"]
            sent_obj.add_word(lemma, int(idx))
            dom = word_elem.attrib["DOM"]
            if dom != "_root":
                link_type = word_elem.attrib["LINK"]
                sent_obj.add_link(int(idx), int(dom), link_type)
        sentences.append(sent_obj)
    return sentences
def _obj_and_predicate_dict_by_wo_from_sentences(self):
    """
    From self.sentences, find "〜〜を〜〜" patterns and return them as "A を B" pairs.
    In the final computation, part-of relations are judged by the order of
    appearance within the same page.
    """
    results = []
    order = 0
    self._set_subtypes()
    for i, sentence in enumerate(self.sentences):
        if type(sentence) == str:
            sentence = Sentence(text=sentence, query=self.query)
        if not sentence.set_noun_verb_if_good_task():
            continue
        object_term = ObjectTerm(sentence.noun)
        if object_term == 'ましょ':
            pdb.set_trace()
        if object_term.core_noun in constants.STOPWORDS_OF_WEBPAGE_NOUN:
            continue

        distance_between_subtypes = {}
        for subtype in self.subtypes:
            distance = self._distance_between_subtype(i, subtype=subtype)
            distance_between_subtypes[subtype] = distance

        task = Task(object_term=object_term.name,
                    cmp=sentence.cmp,
                    predicate_term=sentence.verb,
                    distance_between_subtypes=distance_between_subtypes,
                    query=self.query,
                    order=order,
                    url=self.url,
                    is_shopping=False,
                    is_official=False,
                    rank=self.rank,
                    sentence=sentence.body)
        results.append(task)
        print('%s_%s_%sというタスクをセットしました' % (sentence.noun, sentence.cmp, sentence.verb))
        order += 1  # order of appearance
    return results
def parse(self):
    this_file = os.path.dirname(__file__)
    current_sentence = []
    for line in open(os.path.join(this_file, self.data_path)):
        split_line = line.split()
        if not split_line:
            self.sentences += Sentence.process_sentence_data(current_sentence)
            current_sentence = []
        else:
            (word, pos_tag, source, tag) = split_line
            tag = tag if tag == 'O' else tag.split('-')[1]
            current_sentence.append((word, pos_tag, tag))
def setUp(self):
    self.sentence_1 = Sentence('夏野菜へのシフトをスタートさせましょう', '')
    self.sentence_2 = Sentence('クワなどは体格や体力に応じたものを、吟味して選ぶようにしましょう', '')
    self.sentence_3 = Sentence('トイレ掃除方法を、解説していきましょう', '')
    self.sentence_4 = Sentence('地面を掘り上げていきましょう', '')
    self.sentence_5 = Sentence('右に移動してください', '')
    self.sentence_6 = Sentence('じっくり本を読む', '')
    self.sentence_7 = Sentence('水まわりは汚れやすいですから、定期的に掃除してガンコな汚れがつかないように気をつけておきましょう', '')
def convert_pattern_format(text):
    """
    Text is parsed through pattern's parsing function into a standardized format.
    """
    parsed_text = []
    # parse text via Pattern's parser
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Pattern's tags for each word in the sentence are stored in a new Word-object
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # each word is appended to a Sentence-object
            s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
def preproc_save_sentences(filename, raw_sent_stream, extra_stopwords=None):
    stop_words = set(stopwords.words("english"))
    if extra_stopwords is not None:
        stop_words |= set(extra_stopwords)

    with open(filename, "wt") as outf:
        outf.write("[")
        for index, raw_sent in enumerate(raw_sent_stream):
            prev_terminator = '\n' if index == 0 else ',\n'
            sentence = Sentence.from_raw(raw_sent, stop_words)
            if len(sentence.words) > 0:
                outf.write(prev_terminator + sentence.dump_json())
            print "{}-th sentence processed and saved".format(index + 1)
        outf.write("\n]")
def bot_init(self):
    """
    Initialize and configure your bot!

    Use this function to set options and initialize your own custom bot
    state (if any).
    """
    self.generator = Bot()
    self.sentencer = Sentence()

    ############################
    # REQUIRED: LOGIN DETAILS! #
    ############################
    self.config['api_key'] = CONSUMER_KEY
    self.config['api_secret'] = CONSUMER_SECRET
    self.config['access_key'] = TOKEN
    self.config['access_secret'] = SECRET

    ######################################
    # SEMI-OPTIONAL: OTHER CONFIG STUFF! #
    ######################################
    MINS, HOURS = 60, 60 * 60

    # how often to tweet, in seconds
    # self.config['tweet_interval'] = 60 * 60  # default: 30 minutes

    # use this to define a (min, max) random range of how often to tweet
    # e.g., self.config['tweet_interval_range'] = (5*60, 10*60)  # tweets every 5-10 minutes
    self.config['tweet_interval_range'] = (30 * MINS, 3 * HOURS)

    # only reply to tweets that specifically mention the bot
    self.config['reply_direct_mention_only'] = False

    # only include bot followers (and original tweeter) in @-replies
    self.config['reply_followers_only'] = True

    # fav any tweets that mention this bot?
    self.config['autofav_mentions'] = False

    # fav any tweets containing these keywords?
    self.config['autofav_keywords'] = []

    # follow back all followers?
    self.config['autofollow'] = False
def load_sentences():
    dbname = "tripadvisor_train"
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db["sentiment_sentences"]

    # cursor = sentisent_collection.find({'sentiment':{'$lte':2}}).skip(99).limit(120)
    cursor = sentisent_collection.aggregate([
        {'$match': {'sentiment': 3}},
        {'$sample': {'size': 120}}
    ])

    for index, sentd in enumerate(cursor):
        sent = Sentence.from_dict(sentd)
        print "\n\n[{}] Aspect: {}, Sentiment: {}".format(index + 1, sent.aspect, sent.sentiment)
        print sent.raw
        print "--------------"
        print sent.words

    client.close()
def analiza(self, sent):
    sentencia = sent
    # if self.printing: print(sentencia)
    # sent = Sentencia(sentencia)
    # self.sentencias.append(Sentencia(sentencia))
    # self.sentencias[self.n] = Sentencia
    # self.sentencias[self.n].pals = {}
    l = self.tk.tokenize(sentencia)
    ls = self.sp.split(l, 0)
    ls = self.mf.analyze(ls)
    ls = self.tg.analyze(ls)
    ls = self.nec.analyze(ls)
    ls = self.sen.analyze(ls)
    ls = self.parser.analyze(ls)
    ls = self.dep.analyze(ls)

    self.sent = Sentence()
    self.sent.set_sentence(sent)
    self.sent.set_con(self.con)

    ## output results
    for s in ls:
        # sent.pals = []
        self.sent.sentence = s
        ws = s.get_words()
        num = 0
        for w in ws:
            self.sent.add_pal(w, num)
            # self.sentencias[len(self.sentencias)-1].pals.append(Pal(w, self.con))
            num += 1
            '''
            print(w.get_form()+" "+w.get_lemma()+" "+w.get_tag()+" "+w.get_senses_string());
            print ("");
            '''
        # self.tr = s.get_parse_tree();
        # self.printTree(self.tr.begin(), 0);
        # self.computeTree(self.tr.begin(), 0);
        self.dp = s.get_dep_tree()
        self.computeDepTree(self.dp.begin(), 0)
def import_dataset(filename):
    #....Read input file
    input_file = open(filename, 'r')
    all_lines = input_file.readlines()
    input_file.close()

    #...now, for each line...
    dataset = []
    for line_idx, line in enumerate(all_lines):
        #...preprocess, extract everything....
        try:
            #...construct the sentence with its attributes from raw text...
            new_sentence = Sentence.create_from_raw_text(line)
            dataset.append(new_sentence)
        except Exception as e:
            print("Error found while processing <" + filename + ">, line: " + str(line_idx + 1))
            print(line)
            print(e)

    return dataset
def test_sentence():
    stopwords = text_utility.make_stop_words()
    texts = [
        "can't is a contraction",
        "she isn't my wife any more",
        "I am not in USA right now",
        "I'm a Chinese",
        "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103",
        "I should've done that thing I didn't do",
        "I don't love her any more",
        "I want to divorce without hesitation",
        "bye, Pullman, bye, USA"
    ]
    for index, text in enumerate(texts):
        sent = Sentence.from_raw(text, stopwords, True)
        print "\n******************** {}".format(index + 1)
        print sent.raw
        print "===>"
        print sent.words
def print_topics(txt):
    sentence = Sentence.from_raw(txt, stop_words)
    print "\n{}\n".format(sentence.raw)

    coded_words = wordcoder.code(sentence.words)
    bow = dictionary.doc2bow(coded_words)

    topic_distribution = lda_model[bow]
    topic_distribution.sort(key=lambda t: t[1], reverse=True)

    tags = None
    for index, (topic_id, topic_percentage) in enumerate(topic_distribution):
        mt = MixTopic(topic_mapping[topic_id])
        mt.weight(topic_percentage)
        if tags is None:
            tags = mt
        else:
            tags.add(mt)

    tags.normalize()
    print tags
def update_add_neg_suffix(dbname, query_condition):
    stop_words = common.make_stop_words()

    client = MongoClient()
    review_collection = client[dbname]['reviews']
    cursor = review_collection.find(query_condition,
                                    {"sentences.raw": 1, "sentences.words": 1})
    for rindex, rd in enumerate(cursor):
        review = Review.from_dict(rd)

        update_content = {}
        for sindex, sent in enumerate(review.sentences):
            new_sent = Sentence.from_raw(sent.raw, stop_words)
            if set(new_sent.words) != set(sent.words):
                update_content["sentences.{}.words".format(sindex)] = new_sent.words

        if len(update_content) > 0:
            result = review_collection.update_one({"_id": review.id},
                                                  {"$set": update_content})
            if result.modified_count != 1:
                raise Exception("failed to update review<{}>".format(review.id))

        print "{}-th review updated {} sentences".format(rindex + 1, len(update_content))

    client.close()
def sample_split(dbname, num_train, num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences

    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()
    all_samples = []

    cursor = sentisent_collection.aggregate([
        {'$sample': {'size': num_train + num_test}}
    ])
    for index, d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append((sent.words, sent.sentiment))
        aspect_dist[sent.aspect] += 1
        sentiment_dist[int(sent.sentiment)] += 1
    client.close()

    ################## show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k, aspect_dist.freq(k))
    for k in sentiment_dist:
        print '[{}]: {}'.format(k, sentiment_dist.freq(k))

    ################## shuffle
    random.shuffle(all_samples)

    ################## split
    def __dump(filename, data):
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)

    __dump("sentidata_train_raw.pkl", all_samples[:num_train])
    __dump("sentidata_test_raw.pkl", all_samples[num_train:])