def predict():
    preproses()
    td = TFIDF([xdata, ydata])
    clasification = []
    # Receive the input query from the form
    if request.method == 'POST':
        namequery = request.form['namequery']
        spliter = namequery.split(',')
        for row in spliter:
            clasification.append(testFromTrained([td.transform(row)]))
        print(clasification)
        keras.clear_session()  # assumes `keras` is bound to the backend module, e.g. `from keras import backend as keras`
        # Both pairs hold the same counts; the second copy feeds the legend.
        labels, values = np.unique(clasification, return_counts=True)
        lbls, vals = np.unique(clasification, return_counts=True)
        pie_labels = labels
        pie_values = values
        colors = ["#F7464A", "#46BFBD"]
        return render_template('hasil.html',
                               set=zip(values, labels, colors),
                               clasification=zip(spliter, clasification),
                               legenda=zip(lbls, vals))
def test_tfidf(self):
    """
    Test the TF-IDF scheme.
    """
    idf = {'a': 2, 'b': 1, 'c': 1}
    tokens = ['a', 'b', 'b', 'c', 'd']
    tfidf = TFIDF(idf, 3)
    document = tfidf.create(tokens)
    self.assertEqual(0, document.dimensions['a'])
    self.assertEqual(0.35218, round(document.dimensions['b'], 5))
    self.assertEqual(0.17609, round(document.dimensions['c'], 5))
    self.assertEqual(0.47712, round(document.dimensions['d'], 5))
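# The expected values above are consistent with weighting each term as
# count * log10(N / (df + 1)), with N = 3 documents and df taken from the
# `idf` table (0 for the unseen 'd'). The exact scheme is an inference from
# the test, not confirmed by this excerpt; a quick standalone check:
import math

idf = {'a': 2, 'b': 1, 'c': 1}
tokens = ['a', 'b', 'b', 'c', 'd']
N = 3
for term in sorted(set(tokens)):
    score = tokens.count(term) * math.log10(N / (idf.get(term, 0) + 1))
    print(term, round(score, 5))
# a 0.0, b 0.35218, c 0.17609, d 0.47712 -- matching the assertions.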
def main():
    # try:
    c = corpus()
    tfidf = TFIDF()
    tf_type = 'aug_freq'
    idf_type = 'inv_smooth_idf'
    for i, doc in enumerate(c.documents):
        cnt = 0
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf.tfidf(word, doc, c.documents, tf_type, idf_type)
                  for word in doc.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            cnt += 1
            if score > 0:
                print("\tWord {}: {}, TF-IDF: {}".format(cnt, word, round(score, 5)))
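# 'aug_freq' and 'inv_smooth_idf' presumably select the textbook weighting
# variants those names usually denote. A minimal sketch of those standard
# definitions (an assumption about this TFIDF class, not its actual code):
import math

def aug_freq(term, doc_words):
    # Augmented frequency: 0.5 + 0.5 * f(t, d) / max_t' f(t', d),
    # which damps the bias toward long documents.
    counts = {w: doc_words.count(w) for w in set(doc_words)}
    return 0.5 + 0.5 * counts.get(term, 0) / max(counts.values())

def inv_smooth_idf(term, docs):
    # Smoothed inverse document frequency: log(N / (1 + df)) + 1,
    # finite even for a term present in every document.
    df = sum(1 for d in docs if term in d)
    return math.log(len(docs) / (1 + df)) + 1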
def preproses(self, filepath):
    f = open(filepath)
    # split on newlines
    sents = f.read().split('\n')
    # shuffle the sentence order
    shuffle(sents)
    # for each sentence:
    # - split on the semicolon separator
    # - append text and label to the instance variables
    for sent in sents:
        temp = sent.split(';')
        if len(temp) == 2:
            self.xdata.append(temp[0])
            self.ydata.append([int(temp[1])])
    # prepare the tf-idf features
    self.tfidf_data = TFIDF([self.xdata, self.ydata])
def wordcount(filename, ent_file, tfidf, text, id):
    # `text` and `id` are callables that extract the text and the group key
    # from a CSV row; `tfidf` is the output path.
    resources = open(filename)
    resources.readline()  # skip header
    wordcount = TFIDF(get_entities(ent_file))
    for id, lines in groupby(csv.reader(resources), id):  # note: the loop variable shadows the `id` parameter
        maintext = ' '.join(text(line).lower() for line in lines)
        wordcount.process(maintext)
    wordcount.done()
    out = open(tfidf, 'w')
    for word, _, _, tfidf in wordcount.highest(200):
        out.write('%s\t%f\n' % (word, tfidf))
def upload_file():
    if request.method == 'POST':
        if 'file' not in request.files:
            flash('No file part')
            # return redirect(request.url)
        file = request.files['file']
        if file.filename == '':
            flash('No selected file')  # was `flask(...)`, a typo for `flash(...)`
            # return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # return redirect(url_for('upload_file', filename=filename))
            print(filename)
            fold = "data/" + filename
            print(fold)
            with open(fold, 'r') as csv_par:
                preproses()
                td = TFIDF([xdata, ydata])
                clasification = []
                csv_reader = csv_par.read().split('\n')
                for row in csv_reader:
                    clasification.append(testFromTrained([td.transform(row)]))
                keras.clear_session()
                labels, values = np.unique(clasification, return_counts=True)
                lbls, vals = np.unique(clasification, return_counts=True)
                pie_labels = labels
                pie_values = values
                colors = ["#F7464A", "#46BFBD"]
                return render_template('hasil.html',
                                       set=zip(values, labels, colors),
                                       clasification=zip(csv_reader, clasification),
                                       legenda=zip(lbls, vals))
def parsing():
    with open('data/test.csv', 'r') as csv_par:
        preproses()
        td = TFIDF([xdata, ydata])
        rowdata = []
        clasification = []
        csv_reader = csv_par.read().split('\n')
        for row in csv_reader:
            rowdata.append(row)
            clasification.append(testFromTrained([td.transform(row)]))
        keras.clear_session()
        labels, values = np.unique(clasification, return_counts=True)
        lbls, vals = np.unique(clasification, return_counts=True)
        pie_labels = labels
        pie_values = values
        colors = ["#F7464A", "#46BFBD"]
        return render_template('hasil.html',
                               set=zip(values, labels, colors),
                               clasification=zip(csv_reader, clasification),
                               legenda=zip(lbls, vals))
class Analysis:
    def __init__(self):
        self.preparation = Preparation()
        self.tfidf = TFIDF()

    def get_dataframe_from_json(self, json_data) -> pd.DataFrame:
        # convert the JSON file to a pandas dataframe
        dataframe = self.preparation.jsonfile_to_dataframe(json.load(json_data))
        # refine the dataframe by removing entries that contain no articles
        dataframe = self.preparation.refine_dataframe(dataframe)
        return dataframe

    def get_tfidf_from_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        return self.tfidf.get_tfidf_dataframe(dataframe)
def test_export(self):
    """
    Test exporting and importing the IDF table.
    """
    idf = {'a': 2, 'b': 1, 'c': 1}
    tfidf = TFIDF(idf, 3)
    e = tfidf.to_array()
    self.assertEqual(tfidf.global_scheme.documents,
                     TFIDF.from_array(e).global_scheme.documents)
    self.assertEqual(tfidf.global_scheme.idf,
                     TFIDF.from_array(e).global_scheme.idf)
    self.assertEqual(tfidf.local_scheme.__dict__,
                     TFIDF.from_array(e).local_scheme.__dict__)
    self.assertEqual(tfidf.global_scheme.__dict__,
                     TFIDF.from_array(e).global_scheme.__dict__)
def getRecommendation(new_df, record):
    temp_df = new_df[['id', 'name', 'album', 'artist', 'release_date']]
    temp_df = pd.concat([temp_df, record], ignore_index=True)
    col = ['name', 'album', 'artist', 'release_date']
    data = pd.DataFrame(columns=col)
    id = []
    for i in col:
        yield "<br/>"  # the yield makes this a generator (e.g. for a streamed response)
        tf = TFIDF(temp_df, i)
        cosine_sim = linear_kernel(tf, tf)
        data[i] = cosine_sim[-1]  # similarities of the appended record against all rows
        d1 = data.sort_values(by=[i], ascending=False)
        id.append(list(d1.head(7).index))
    tid = []
    for i in range(4):  # one index list per column in `col`
        track_id = []
        for j in id[i]:
            track_id.append(temp_df.iloc[j, 0])
        tid.append(track_id)
    return tid  # inside a generator, `return` ends iteration carrying `tid`
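# `linear_kernel` returns plain dot products, so it equals cosine similarity
# only when the TF-IDF rows are already l2-normalized (sklearn's
# TfidfVectorizer does this by default; whether the custom TFIDF above does
# is an assumption). A minimal self-contained illustration of the pattern:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

names = ["love story", "story of us", "blank space"]
tf = TfidfVectorizer().fit_transform(names)   # rows are l2-normalized
cosine_sim = linear_kernel(tf, tf)            # == cosine_similarity(tf, tf)
print(cosine_sim[-1])                         # last row vs. every row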
def count(district, type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    # Column indices for projects.csv (46 columns).
    (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
     school_longitude, school_city, school_state, school_zip, school_metro,
     school_district, school_county, school_charter, school_magnet,
     school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
     teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
     primary_focus_subject, primary_focus_area, secondary_focus_subject,
     secondary_focus_area, resource_usage, resource_type, poverty_level,
     grade_level, vendor_shipping_charges, sales_tax,
     payment_processing_charges, fulfillment_labor_materials,
     total_price_excluding_optional_support,
     total_price_including_optional_support, students_reached,
     used_by_future_students, total_donations, num_donors,
     eligible_double_your_impact_match, eligible_almost_home_match,
     funding_status, date_posted, date_completed,
     date_thank_you_packet_mailed, date_expiration) = range(46)
    proj_ids = []
    projects = open('../data/projects.%scsv' % district)
    projects.readline().strip()  # skip header
    for proj in csv.reader(projects):
        if proj[date_posted].startswith('2011'):
            proj_ids.append(proj[0])
    proj_ids = frozenset(proj_ids)
    projects.close()
    wordcount = TFIDF(get_entities(ent_file))  # `ent_file` is assumed to be a module-level global
    essays = open('../data/%s.%scsv' % (type, district))
    essays.readline()  # skip header
    for proid, lines in groupby(csv.reader(essays), id):
        if proid in proj_ids:
            text = ' '.join(extract_text(line) for line in lines).lower()
            wordcount.process(text)
    wordcount.done()
    essays.close()
    out = open('../data/wc_%s%scsv' % (type, district), 'w')
    for word, tf, df, tfidf in wordcount.highest(0):
        out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
class Tagger:
    def __init__(self):
        self.documents = {}
        self.tfidf = TFIDF()

    def add_document(self, document):
        self.documents[document.id] = document

    def display(self):
        for id in self.documents:
            self.documents[id].display()

    def get_terms_weighted_by_tfidf(self, document):
        documents = [self.documents[key] for key in self.documents]
        tfidf_list = self.tfidf.calculate_tfidf_document(documents, document)
        weighted_terms = {}
        for d in tfidf_list:
            term = d["term"]
            tf = d["tf"]
            idf = d["idf"]
            weighted_terms[term] = tf * idf
        return weighted_terms

    def get_tags_using_weighted_terms(self, weighted_terms, size=5):
        sorted_terms = sorted(weighted_terms.items(),
                              key=operator.itemgetter(1), reverse=True)
        length = len(weighted_terms)
        size = length if size > length else size
        tags = []
        for i in range(size):
            tags.append(sorted_terms[i][0])
        return tags

    def __str__(self):
        return str(pprint(vars(self)))
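# `get_tags_using_weighted_terms` is top-k selection by weight; the same
# result in one line with heapq (illustration only, not part of the class):
import heapq

weighted_terms = {"music": 0.42, "jazz": 0.91, "the": 0.01, "sax": 0.55}
tags = heapq.nlargest(2, weighted_terms, key=weighted_terms.get)
print(tags)  # ['jazz', 'sax']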
def get_pred_api_set(desc):
    tfidf = TFIDF(desc).gen_vector()
    cluster = Match(tfidf).match()
    topN = TopN(cluster).get()
    return set(topN)
# (fragment: the lines up to the closing ''' belong to a block commented out
# in the source; the if-branch matching the `else:` below lies above this excerpt)
    '''
    static.upload_folder(Sm_Cover_Dir, overwrite=True)
    static.upload_folder(Bg_Cover_Dir, overwrite=True)
    logger.info("update static server success !")
    with NewsDB() as db:
        db.update_table_newsContent(method="rebuild", fromCache=False)
        db.update_table_newsDetail(method="update")
    '''
    with NewsDB() as db:  # update the DB only, leave static assets untouched
        db.update_table_newsInfo(method="rebuild", fromCache=False)
        db.update_table_newsContent(method="rebuild", fromCache=False)
        db.update_table_newsDetail(method="update")
    WhooshIdx().create_idx()
    logger.info("update TFIDF ...")
    tfidf = TFIDF().init_for_update()
    tfidf.update()
    logger.info("update TFIDF success !")
else:  # routine daily update
    with NewsDB() as db:
        db.update_table_newsInfo(fromCache=False)
        newsIDs = db.get_newsIDs()
    # Update the static server immediately after refreshing the IDs, so users
    # requesting images during the update window don't cache 404 pages.
    logger.info("update static server ...")
    static = StaticManager(newsIDs)
    static.download_covers()
    static.to_jpeg()
    static.cv_compress_sm()  # output directly
    static.cv_compress_bg()
from tfidf import TFIDF
from match import Match
from topN import TopN
import sys

desc = sys.argv[1]

# online phase step 1
tfidf = TFIDF(desc).gen_vector()
# online phase step 2
cluster = Match(tfidf).match()
# online phase step 3
topN = TopN(cluster).get()

for i in topN:
    print(i)
import pickle
from TrainingWithTFIDF import TFIDFTrainer
tfidfTrainer = TFIDFTrainer()
from FeatureExtractionWithTFIDF import TFIDFPreparer
from tfidf import TFIDF
tfidfInstance = TFIDF()
import nltk
tfidfPreparer = TFIDFPreparer()


class IntentDetector:
    def prepareForNLP(self, text):
        sentences = nltk.sent_tokenize(text)
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        return sentences

    def getFilterChunk(self, sentence):
        chunkToExtract = """
        pattern: {<NNP|NNS|NN><WDT>?<VBP|VBZ>?<JJR>?<IN><CD><CC>?<CD>?}
        """
        parser = nltk.RegexpParser(chunkToExtract)
        result = parser.parse(sentence)
        chunks = []
        for subtree in result.subtrees():
            if subtree.label() == 'pattern':
                chunks.append(subtree)  # assumed completion: the excerpt ends mid-loop here
def start():
    # initialize TFIDF
    tfidf = TFIDF("tfidf_data/name_and_abstracts.txt")
    print("TFIDF initialized")
    # input_file = open("publications.txt")
    input_file = open("pub_min.txt")
    while True:
        ''' Parse paper title. Test for EOF. '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        toks = word_tokenize(title)
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=True)
        print("sorted toks: " + str(toks))  # was a Python 2 print statement
        ''' Parse author. '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')
        ''' Parse Year '''
        input_file.readline()
        ''' Parse Venue '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]
        ''' Parse paper id. Do not cast to integer. Simply unnecessary. '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]
        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
            dictionary_add_set(venue_papers, venue, id)
        paper_venue[id] = venue
        paper_authors[id] = authors
        ''' Parse citations. '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            ''' Invalid/empty citation. '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()
        '''
        Read the empty string line so the readline output is not confused
        with EOF. Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()
    return paper_authors, \
        paper_papers, \
        paper_venue, \
        author_papers, \
        venue_papers, \
        author_venues
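# For reference, the record layout the parsers above and below assume,
# inferred from their assertions (it matches the ArnetMiner/DBLP citation
# dump format; the year prefix is not asserted, so that detail is assumed):
#   #*      paper title
#   #@      comma-separated authors
#   #t      year (read but not parsed)
#   #c      venue
#   #index  paper id
#   #%      one cited paper id per line (optional, repeated)
#   #!      abstract (skipped)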
def start(tfidf_threshold):
    # initialize TFIDF
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for line in phrase_file:
        index, text = line.split("##")
        token_list = text.lower().strip().split("!!")
        id_phrases[index] = token_list
    phrase_file.close()
    tfidf = TFIDF(id_phrases.values())
    print("TFIDF initialized")
    input_file = open("publications.txt")
    # input_file = open("pub_min.txt")
    while True:
        ''' Parse paper title. Test for EOF. '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        ''' Parse author. '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')
        ''' Parse Year '''
        input_file.readline()
        ''' Parse Venue '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]
        ''' Parse paper id. Do not cast to integer. Simply unnecessary. '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]
        id_title[id] = title
        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
            dictionary_add_set(venue_papers, venue, id)
        paper_venue[id] = venue
        paper_authors[id] = authors
        ''' Parse citations. '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            ''' Invalid/empty citation. '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()
        '''
        Read the empty string line so the readline output is not confused
        with EOF. Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()
    ''' Get terms for each paper. '''
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for paper_id, tok_list in id_phrases.items():
        ''' Assuming (id, list_of_tokens). If I'm wrong, the code will HCF. '''
        toks = [x for x in tok_list
                if len(x) > 2 and tfidf.tf_idf(x) > tfidf_threshold]
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=False)
        paper_terms[paper_id] = toks[:min(3, len(toks))]
        for term in paper_terms[paper_id]:
            if term not in term_papers:  # was the Python 2-only `has_key`
                term_papers[term] = []
            term_papers[term].append(paper_id)
    return paper_authors, \
        paper_papers, \
        paper_venue, \
        author_papers, \
        venue_papers, \
        author_venues
class Analiser:
    xData = []
    yData = []

    def __init__(self, training_data='dataset/processedDataset_pool2.csv'):
        # forward slash avoids the fragile backslash escape in the original path
        self.preprocess(training_data)

    def preprocess(self, filepath):
        dataset = pd.read_csv(filepath, delimiter=',')
        self.xData = []
        self.yData = []
        for k in dataset['Kalimat']:
            self.xData.append(k)
        for k in dataset['Formalitas']:
            self.yData.append(k)
        self.tfidf_data = TFIDF([self.xData, self.yData])

    def save_model(self, model, file_name='model'):
        self.model_load = model
        model_json = model.to_json()
        with open('model/' + file_name + '.json', 'w') as json_file:
            json_file.write(model_json)
        model.save_weights('model/' + file_name + '.h5')
        print("Saved model to disk")

    def load_model(self, file_name='model'):
        model = Sequential()
        json_file = open('model/' + file_name + '.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        model = model_from_json(loaded_model_json)
        model.load_weights('model/' + file_name + '.h5')
        print("Loaded model from disk")
        self.model_load = model
        return model

    def train(self, output_file='model'):
        x = self.tfidf_data.getOnlyXData()
        # one-hot encode the labels: 1 -> [1, 0] (formal), else [0, 1]
        y = []
        for i in self.yData:
            if i == 1:
                y.append([1, 0])
            else:
                y.append([0, 1])
        model = Sequential()
        # cap the input dimension at 1800 features
        input_data_dimen = len(x[0])
        input_data_dimen = 1800 if input_data_dimen > 1800 else input_data_dimen
        model.add(Dense(units=int(0.39 * input_data_dimen),
                        activation='tanh', input_dim=input_data_dimen))
        model.add(Dense(units=int(0.075 * 0.39 * input_data_dimen),
                        activation='tanh'))
        model.add(Dense(units=2, activation='softmax'))
        learning_rate = .025
        batch_size = 16
        loss_error = 'categorical_crossentropy'
        epoch = 20
        sgd = SGD(lr=learning_rate)
        model.compile(optimizer=sgd, loss=loss_error, metrics=['accuracy'])
        x_train, x_test, y_train, y_test = self.train_custom_split(x, y, 0.5)
        self.history = model.fit(x=x_train, y=y_train,
                                 validation_data=(x_test, y_test),
                                 batch_size=batch_size,
                                 nb_epoch=epoch)  # Keras 1 API; `epochs=` in Keras >= 2
        self.save_model(model, output_file)

    def train_custom_split(self, x, y, sr_train, test_ratio=0.2):
        # stratified split: keep the formal/informal ratio in the train set
        dataset = []
        for i in range(len(y)):
            dataset.append([x[i], y[i]])
        shuffle(dataset)
        formal = []
        informal = []
        for i in range(len(dataset)):
            if dataset[i][1][0] == 0:
                informal.append(dataset[i])
            else:
                formal.append(dataset[i])
        x_train = []
        x_test = []
        x_test_temp = []
        y_train = []
        y_test = []
        y_test_temp = []
        formal_len = len(formal)
        formal_rat = formal_len * sr_train
        inform_len = len(informal)
        inform_rat = inform_len * sr_train
        for i in range(formal_len):
            if i < formal_rat:
                x_train.append(formal[i][0])
                y_train.append(formal[i][1])
            else:
                x_test_temp.append(formal[i][0])
                y_test_temp.append(formal[i][1])
        for i in range(inform_len):
            if i < inform_rat:
                x_train.append(informal[i][0])
                y_train.append(informal[i][1])
            else:
                x_test_temp.append(informal[i][0])
                y_test_temp.append(informal[i][1])
        test_len = len(y_test_temp)
        test_rat = test_ratio * test_len
        for i in range(test_len):
            if i >= test_rat:
                x_train.append(x_test_temp[i])
                y_train.append(y_test_temp[i])
            else:
                x_test.append(x_test_temp[i])
                y_test.append(y_test_temp[i])
        return (np.array(x_train), np.array(x_test),
                np.array(y_train), np.array(y_test))

    def getBinaryResult(self, x):
        print(x)
        return "FORMAL" if x[0][0] > x[0][1] else "NON FORMAL"

    def testFromTrained(self, x):
        if self.model_load == 'None':  # note: compares against the string 'None', not the None singleton
            print("Model not found!")  # was Indonesian: "Model tidak ditemukan!"
            exit(0)
        return self.getBinaryResult(self.model_load.predict_proba(np.array(x)))

    def showPlot(self):
        history = self.history
        # plot model accuracy
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model Accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        # plot model loss
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model Loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
def calcTFIDF(self):
    t = TFIDF()
    self.tfidf = t.docHandler(self.inverted_index, self.unique_id)
class DomainSimilarity:
    def __init__(self, input_dir, threshold_tfidf,
                 threshold_perplexity_ngram, threshold_edit_distance):
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        # Load all sentences from the specific domain and train the TFIDF
        # and NGramPerplexity models.
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file)
            with open(self.input_dir + "/" + file, encoding="utf-8") as input:
                for line in input:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    def print_progress(self):
        print("Average tfidf: " + str(1 - self.sum_tfidf / self.queries_asked))
        print("Average ngram-perplexity: " + str(self.sum_ngp / self.sentences_asked))
        print("Average edit-distance: " + str(self.sum_edit / self.queries_asked))
        print("Accept percent by tfidf extractor: " +
              Formatter.percent(self.accepted_by_tfidf / self.sentences_asked))
        print("Accept percent by ngram-perplexity extractor: " +
              Formatter.percent(self.accepted_by_ngp / self.sentences_asked))
        print("Accept percent by edit-distance extractor: " +
              Formatter.percent(self.accepted_by_edit_distance / self.sentences_asked))

    def accepts_sentence(self, words_general):
        # words_general: list of words from a general-domain sentence.
        # Returns True if the sentence is close enough to the specific domain
        # by any one of three tests:
        #   - tf-idf cosine similarity to a stored sentence above threshold
        #   - n-gram perplexity below threshold
        #   - normalized Levenshtein distance to a stored sentence below threshold
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False
        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True
        for words_specific in self.sentences:
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True
        if accept_tfidf or accept_ngp or accept_edit_distance:
            return True
        return False
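# calc_cosine_similarity above takes two word lists; a minimal sketch of
# cosine similarity over raw term counts for orientation (the real model
# presumably also weights by idf, which is omitted here -- this is an
# illustration under that assumption, not the class's actual code):
import math
from collections import Counter

def cosine_similarity(words_a, words_b):
    a, b = Counter(words_a), Counter(words_b)
    dot = sum(a[t] * b[t] for t in a.keys() & b.keys())
    norm = (math.sqrt(sum(v * v for v in a.values())) *
            math.sqrt(sum(v * v for v in b.values())))
    return dot / norm if norm else 0.0

print(cosine_similarity("the weather is great".split(),
                        "great weather today".split()))  # ~0.577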
from tfidf import TFIDF
from wordextractor import WordExtractor


def calc_most_similar(tfidf, query, docs):
    best_doc = ""
    max_sim = 0
    for doc in docs:
        sim = tfidf.calc_cosine_similarity(WordExtractor.get_words(query),
                                           WordExtractor.get_words(doc))
        if sim > max_sim:
            max_sim = sim
            best_doc = doc
    return best_doc


tfidf = TFIDF()
d1 = "It is a great day today"
d2 = "The weather is absolutely great"
d3 = "It is so warm today, almost too hot"
d4 = "We're very happy with the weather today"
d5 = "It is a great day to be at the beach!"
d6 = "We should get out and enjoy the weather right now :)"
d7 = "I've bought a radio I plan to bring to the beach today"
d8 = "The beach is a bit crowded"
d9 = "There's many kids at the beach today"
d10 = "If it starts to rain at the beach, I will go home"
documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
for doc in documents:
    words = WordExtractor.get_words(doc)
    tfidf.train_from_text(words)

q1 = "We have not had rain for a long time!"
# the excerpt ends here; presumably the query is then matched, e.g.:
print(calc_most_similar(tfidf, q1, documents))
    # (tail of a model-loading/classification helper; the excerpt begins mid-function)
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load weights into the new model
    model.load_weights("model/model.h5")
    print("Loaded model from disk")
    sgd = SGD(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    return getBinaryResult(model.predict_proba(np.array(x)))


preproses()
td = TFIDF([xdata, ydata])

# TRAINING
# train(td.getOnlyX(), ydata)

# RETRAINING
# retrain_model(td.getOnlyX(), ydata)

# TESTING
test = "ahok itu pemimpin yang beres memimpin"   # "Ahok is a leader who governs properly"
print(test)
print(testFromTrained([td.transform(test)]))
test = "ahok itu pemimpin yang ga beres memimpin"  # "Ahok is a leader who does not govern properly"
print(test)
print(testFromTrained([td.transform(test)]))
def reward2(s1, s2):
    indices = corpus[:]
    tfi = TFIDF()
    tfidf = tfi.get_tfidf(corpus)
    score = tfi.relevancy(tfidf, indices, s1, s2)
    return score + 1
class Query:
    def __init__(self, queryString=""):
        print("Constructing Query Object!")
        self.invIndex = InvertedIndex()
        self.tfidf = TFIDF()
        self.query = queryString

    ###################### Other Functions ######################

    def removeMissingQueryTerms(self, queryList, invIndex):
        removedWords = []
        # iterate over a copy: removing from a list while iterating it skips elements
        for word in list(queryList):
            isWordInIndex = invIndex.get(word, -1)
            if isWordInIndex == -1:
                removedWords.append(word)
                queryList.remove(word)
        return removedWords

    ###################### Query Functions ######################

    def normQueryTF(self, frequencyDict, numTerms):
        print(numTerms, frequencyDict)
        for key, val in frequencyDict.items():
            frequencyDict[key] = val / numTerms
        return frequencyDict

    def calcQueryCollectionFrequency(self, queryWords):
        collectionFrequency = 0
        frequencyDict = {}
        for i, word in enumerate(queryWords):
            for j, checkWord in enumerate(queryWords):
                if word == checkWord:
                    collectionFrequency += 1
            frequencyDict[word] = collectionFrequency
            collectionFrequency = 0
        return frequencyDict

    def calcQueryIDF(self, freqDict, numDocs):
        tempIDF = {}
        for key, value in freqDict.items():
            # df is 1 because the query is the only document its terms can be in
            tempIDF[key] = 1 + math.log10(numDocs / 1)
        return tempIDF

    def calcQueryTF_IDF(self, normFrequencyDict, queryIDF):
        tf_idf = {}
        for key, value in normFrequencyDict.items():
            tf_idf[key] = normFrequencyDict[key] * queryIDF[key]
        return tf_idf

    def queryHandler(self, query, invIndex):
        # print("Initializing Query Handler...")
        queryWordList = query.split(' ')
        # Remove stop words, clean case/punctuation, stem
        print("Parsing Query Words...")
        parser = Parser()
        queryWordList = parser.fullParse(queryWordList)
        removedWords = self.removeMissingQueryTerms(queryWordList, invIndex)
        if len(queryWordList) == 0:
            return -1
        print("These words were not found and removed from the query: ", removedWords)
        print("Updated Query Words List", queryWordList)
        numTerms = len(queryWordList)
        numDocs = self.tfidf.findNumDocs(invIndex)
        freqDict = self.calcQueryCollectionFrequency(queryWordList)
        normFrequencyDict = self.normQueryTF(freqDict, numTerms)
        # df is always 1 for a query: it is the one and only document in its
        # own collection, so no separate calcQueryDF step is needed
        queryIDF = self.calcQueryIDF(freqDict, numDocs)
        queryTF_IDF = self.calcQueryTF_IDF(normFrequencyDict, queryIDF)
        return queryTF_IDF

    ###################### Cosine Similarity Functions ######################

    def calcQueryDocDotProduct(self, docTF_IDF, queryTF_IDF):
        docDotProducts = {}
        for key, value in queryTF_IDF.items():
            isKeyInDoc = docTF_IDF.get(key, -1)
            if isKeyInDoc != -1:
                for k, v in docTF_IDF[key].items():
                    isDocIdInTempDotProd = docDotProducts.get(k, -1)
                    if isDocIdInTempDotProd == -1:
                        docDotProducts[k] = value * v
                    else:
                        docDotProducts[k] += value * v
            else:
                print(key, "not found in index")
        return docDotProducts

    def calcQueryEuclideanLength(self, queryTF_IDF):
        tempLength = 0
        for key, value in queryTF_IDF.items():
            tempLength += value * value
        return math.sqrt(tempLength)

    def calcDocEuclideanLength(self, docTF_IDF, queryTF_IDF):
        tempDocEuclideanLength = {}
        for key, value in queryTF_IDF.items():
            for k, v in docTF_IDF[key].items():
                isDocInDict = tempDocEuclideanLength.get(k, -1)
                if isDocInDict == -1:
                    tempDocEuclideanLength[k] = v * v
                else:
                    tempDocEuclideanLength[k] += v * v
        for newKey, newVal in tempDocEuclideanLength.items():
            tempDocEuclideanLength[newKey] = math.sqrt(tempDocEuclideanLength[newKey])
        return tempDocEuclideanLength

    def calcCosSimilarity(self, queryDocDotProducts, docLength, queryLength):
        tempCosSimDocs = {}
        for key, value in queryDocDotProducts.items():
            tempCosSimDocs[key] = queryDocDotProducts[key] / (docLength[key] * queryLength)
        return tempCosSimDocs

    def cosSimilarityHandler(self, docTF_IDF, queryTF_IDF):
        print("Initializing Cosine Similarity Handler...")
        queryDocDotProducts = self.calcQueryDocDotProduct(docTF_IDF, queryTF_IDF)
        docLength = self.calcDocEuclideanLength(docTF_IDF, queryTF_IDF)
        queryLength = self.calcQueryEuclideanLength(queryTF_IDF)
        return self.calcCosSimilarity(queryDocDotProducts, docLength, queryLength)

    def parseQuery(self, query, invIndex):
        # Both handlers return their respective TF-IDFs.
        # docTF_IDF only needs to be computed once after a crawl.
        tfidf = TFIDF()
        docTF_IDF = tfidf.docHandler(invIndex, 0)
        queryTF_IDF = self.queryHandler(query, invIndex)
        if queryTF_IDF == -1:
            print("No words from your search were found in any documents..."
                  "Please try new search terms!")
            return -1
        cosSimByDoc = self.cosSimilarityHandler(docTF_IDF, queryTF_IDF)
        # print("Cosine Similarity by document:", cosSimByDoc)
        return cosSimByDoc

    def printDictionaries(self, d):
        print("{:<8} {:<10}".format('DocID ,', 'Number'))
        for k, v in d.items():
            print("{:<8}{:<10}".format(k, v))
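# Worked example of the query-side math above, standalone and with
# hypothetical numbers: for the parsed query ['apple', 'apple', 'pie'] over a
# 10-document index, calcQueryCollectionFrequency gives {'apple': 2, 'pie': 1},
# normQueryTF divides by numTerms = 3, and calcQueryIDF assigns every term
# 1 + log10(10/1) = 2:
import math

query_words = ['apple', 'apple', 'pie']
num_docs = 10
freq = {w: query_words.count(w) for w in query_words}       # {'apple': 2, 'pie': 1}
norm_tf = {w: c / len(query_words) for w, c in freq.items()}
idf = {w: 1 + math.log10(num_docs / 1) for w in freq}        # 2.0 for every term
tf_idf = {w: norm_tf[w] * idf[w] for w in freq}
print(tf_idf)  # {'apple': 1.333..., 'pie': 0.666...}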
from tfidf import TFIDF
from cluster import Cluster
from matrices import Matrices
from topN import TopN

# offline phase step 1
TFIDF.gen_vector()
Cluster.gen_vector()
# offline phase step 2
Matrices.gen_matrices()
# offline phase steps 3 & 4
TopN.gen_topN()
# read scrap_workbook
scrap_workbook = read_scrap(args.scrap_file_name)

## ES6
ES6_sheet = scrap_workbook["蔚来ES6"]  # sheet named "NIO ES6"
review_container = ReviewContainer(ES6_sheet)
review_list = review_container.get_review_list()
doc_word_count_info_list = build_doc_word_count_info_list(review_list)

## build model data structures
term_container = TermContainer(doc_word_count_info_list)
inverted_file = InvertedFile(term_container, doc_word_count_info_list)

# build queries
query_list = get_query_list(args.query_expand_workbook_path)
query_expand_impl = QueryExpandImpl(args.query_expand_workbook_path)
set_topk_for_query_list(query_list, args.topk)
apply_query_expand_to_query_list(query_list, query_expand_impl)

# search
tfidf_engine = TFIDF(review_container)
apply_query_search_to_query_list(query_list, inverted_file, tfidf_engine, review_container)

# output workbook
workbook = Workbook()
update_workbook_for_query_list(query_list, review_container, workbook)
workbook.remove(workbook['Sheet'])  # drop openpyxl's default empty sheet
workbook.save(args.output_path)
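# The custom InvertedFile/TFIDF engine above isn't shown in this excerpt; for
# orientation, a minimal self-contained sketch of the usual structure
# (term -> postings of doc counts, scored with tf * log10(N / df)). Names
# and scoring details are assumptions, not the project's actual code:
import math
from collections import defaultdict

docs = ["great range great seats", "range anxiety on long trips", "seats are comfortable"]
inverted = defaultdict(dict)            # term -> {doc_id: count}
for doc_id, doc_text in enumerate(docs):
    for word in doc_text.split():
        inverted[word][doc_id] = inverted[word].get(doc_id, 0) + 1

def search(query, topk=2):
    scores = defaultdict(float)
    for term in query.split():
        postings = inverted.get(term, {})
        if not postings:
            continue
        idf = math.log10(len(docs) / len(postings))
        for doc_id, count in postings.items():
            scores[doc_id] += count * idf
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:topk]

print(search("great seats"))  # doc 0 ranks first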
def __init__(self, queryString=""): print "Constructing Query Object!" self.invIndex = InvertedIndex() self.tfidf = TFIDF() self.query = queryString
def createTrainingSet(self):
    # initialize one-class SVM models, one per intent
    from sklearn import svm
    windowModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    filterModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    aggregateModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    groupModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    from tfidf import TFIDF
    tfidfInstance = TFIDF()
    documents = []
    fdoc = []
    adoc = []
    wdoc = []
    gdoc = []
    import json
    with open('intents.json') as json_data:
        intentsData = json.load(json_data)
    for intent in intentsData['intents']:
        for pattern in intent['pattern']:
            documents.append(pattern)
            if intent['tag'] == "filter":
                fdoc.append(pattern)
            if intent['tag'] == "window":
                wdoc.append(pattern)
            if intent['tag'] == "aggre":
                adoc.append(pattern)
            if intent['tag'] == "group":
                gdoc.append(pattern)
    texts = []
    # Preprocessing strips words specific to the stream itself: they do not
    # help in intent detection and must be removed.
    from FeatureExtractionWithTFIDF import TFIDFPreparer
    tfidfPreparer = TFIDFPreparer()
    for doc in documents:
        text = tfidfPreparer.prepareTextForTFIDF(doc)
        texts.append(text)
    self.countVectorizer, self.idf = tfidfInstance.getIDF(documents)
    self.tfidf_filter = tfidfInstance.getTFIDF(fdoc, self.countVectorizer, self.idf)
    self.tfidf_aggre = tfidfInstance.getTFIDF(adoc, self.countVectorizer, self.idf)
    self.tfidf_window = tfidfInstance.getTFIDF(wdoc, self.countVectorizer, self.idf)
    self.tfidf_group = tfidfInstance.getTFIDF(gdoc, self.countVectorizer, self.idf)
    # Each training feature is a pattern's summed cosine similarity to all
    # patterns of the same intent.
    x_filter = []
    for i in range(len(fdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_filter[i], self.tfidf_filter)
        x_filter.append([total])
    x_aggre = []
    for i in range(len(adoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_aggre[i], self.tfidf_aggre)
        x_aggre.append([total])
    x_window = []
    for i in range(len(wdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_window[i], self.tfidf_window)
        x_window.append([total])
    x_group = []
    for i in range(len(gdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_group[i], self.tfidf_group)
        x_group.append([total])
    filterModel.fit(x_filter)
    windowModel.fit(x_window)
    aggregateModel.fit(x_aggre)
    groupModel.fit(x_group)
    import pickle
    pickle.dump(windowModel, open('finalized_windowModel.sav', 'wb'))
    pickle.dump(filterModel, open('finalized_filterModel.sav', 'wb'))
    pickle.dump(aggregateModel, open('finalized_aggregateModel.sav', 'wb'))
    pickle.dump(groupModel, open('finalized_groupModel.sav', 'wb'))
if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()
    # Download resources
    download(args_)
    # Import spacy language model
    nlp = spacy.blank("en")
    # Keep all the docs for TF-IDF initialization
    tfidf_docs = []
    # Preprocess dataset
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    glove_ext = '.txt' if glove_dir.endswith('d') else '.{}d.txt'.format(args_.glove_dim)
    args_.glove_file = os.path.join(glove_dir, os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)

    from tfidf import TFIDF
    print(len(tfidf_docs))
    tfidf_scorer = TFIDF(tfidf_docs)
    tfidf_scorer.prepare_data()
    tfidf_scorer.save_to_pickle()