def kmeans_optimal(k, news):
    # k-means with optimized seeding and cosine-similarity assignment
    matrix = tfidf(news)
    centroid = start_optimal(k, matrix)
    for ii in range(10):  # fixed iteration budget
        cluster = kernel_cos(centroid, matrix, k)
        print(rss(cluster, centroid))  # track convergence via RSS
        centroid = new(cluster)
    output(cluster, matrix, news)
def kmeans_euc(k, news):
    # k-means with random seeding and Euclidean-distance assignment
    matrix = tfidf(news)
    centroid = start(k, matrix)
    for ii in range(10):
        cluster = kernel_euc(centroid, matrix, k)
        print(rss(cluster, centroid))
        centroid = new(cluster)
    out = token__.labelize(news)
    output(cluster, matrix, out)
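# Both k-means variants above assume helpers (tfidf, start/start_optimal,
# kernel_cos, kernel_euc, rss, new, output) defined elsewhere. A minimal
# sketch of the cosine assignment step and the RSS diagnostic, assuming dense
# numpy arrays (matrix: docs x terms, centroid: k x terms); names, shapes,
# and the extra `matrix` argument to rss_sketch are assumptions, not the
# original implementation.
import numpy as np

def kernel_cos_sketch(centroid, matrix, k):
    # assign each document to the most cosine-similar centroid
    m = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12)
    c = centroid / (np.linalg.norm(centroid, axis=1, keepdims=True) + 1e-12)
    labels = np.argmax(m @ c.T, axis=1)
    return [np.where(labels == j)[0] for j in range(k)]

def rss_sketch(cluster, centroid, matrix):
    # residual sum of squares: squared distance of each doc to its centroid
    return sum(np.sum((matrix[idx] - centroid[j]) ** 2)
               for j, idx in enumerate(cluster))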
def m_hot_words_tfidf(data):
    printv('Calculating idf scores.')
    idf = tf_idf.idf([v[ID_M_BOW] for v in data.values()])
    all_result = dict()
    for doc_id, bow in data.items():
        printv('Calculating tf-idf scores for ' + doc_id)
        result = list()
        for term in set(bow[ID_M_BOW]):
            tf = tf_idf.tf(term, bow[ID_M_BOW])
            result.append((term, round(tf_idf.tfidf(term, tf, idf), ROUND_DIGITS)))
        # keep only the top-scoring terms for each document
        all_result[doc_id] = dict()
        for word, score in sorted(result, key=lambda x: x[1], reverse=True)[:RESULT_LENGTH]:
            all_result[doc_id][word] = score
    return all_result
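# m_hot_words_tfidf relies on a tf_idf module exposing tf, idf, and tfidf.
# A minimal sketch of one common convention (relative term frequency,
# log inverse document frequency); the actual module may weight differently.
import math

def tf_sketch(term, bow):
    # fraction of tokens in the document equal to `term`
    return bow.count(term) / len(bow)

def idf_sketch(docs):
    # map each term to log(N / document frequency)
    n = len(docs)
    df = {}
    for doc in docs:
        for term in set(doc):
            df[term] = df.get(term, 0) + 1
    return {term: math.log(n / count) for term, count in df.items()}

def tfidf_sketch(term, tf_score, idf_scores):
    return tf_score * idf_scores.get(term, 0.0)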
def data_x():
    # data = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t', encoding='utf-8')
    # data = data.iloc[2000:2500, :]
    str1 = input('Enter your question 1 - ')
    str2 = input('Enter your question 2 - ')
    daa = [[0, 1, 2, str1, str2]]
    data = pd.DataFrame(daa)
    x = data.iloc[:, 1:5]
    # y = data.iloc[:, 5].values
    x = rmpun(x)   # strip punctuation
    x = lower(x)   # lowercase both questions
    dataf = pd.DataFrame()
    temp = pd.DataFrame()
    for i in range(len(x)):
        q1, q2 = str(x.iloc[i, 2]), str(x.iloc[i, 3])
        # basic features: length/word differences, shared start and common words
        bf.diff_len(q1, q2)
        bf.diff_words(q1, q2)
        bf.start_words(q1, q2)
        bf.common_words(q1, q2)
        # fuzzy-matching and set-similarity features
        af.fuzz_ratio(q1, q2)
        af.fuzz_sort(q1, q2)
        af.jaccard(q1, q2)
        af.word_share(q1, q2)
        pos_data = af.parts_of_speech(q1, q2)
        # DataFrame.append was removed in pandas 2.x; use pd.concat there
        temp = temp.append(pos_data, ignore_index=True, sort=False)
        dataframe = tf_idf.tfidf(q1, q2)
        print(i)
    # collect the feature lists accumulated inside bf and af
    dif, diwords, start_word, common_word = bf.all_data()
    ratio, sort_ratio, jcc_sim, wo_sh = af.all_data()
    dataf['diff_len'] = dif
    dataf['diff_words'] = diwords
    dataf['start_words'] = start_word
    dataf['common'] = common_word
    dataf['fuzz_ratio'] = ratio
    dataf['fuzz_sort'] = sort_ratio
    dataf['jaccard'] = jcc_sim
    dataf['word_share'] = wo_sh
    dataf = pd.concat([dataf, temp], axis=1)
    dataf = pd.concat([dataf, dataframe], axis=1)
    # y = np.array(y)
    # dataf['target'] = y
    dataf.to_csv('aaa.csv', index=False)
    return dataf
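# data_x calls feature helpers from bf (basic features) and af (advanced
# features) that accumulate their results internally. A sketch of two of the
# simpler ones, assuming whitespace tokenization; names are hypothetical and
# the real modules also keep state for all_data().
def jaccard_sketch(q1, q2):
    # |intersection| / |union| of the two token sets
    a, b = set(q1.lower().split()), set(q2.lower().split())
    return len(a & b) / len(a | b) if a | b else 0.0

def word_share_sketch(q1, q2):
    # shared words relative to total distinct words across both questions
    a, b = set(q1.lower().split()), set(q2.lower().split())
    total = len(a) + len(b)
    return 2 * len(a & b) / total if total else 0.0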
# from CNN_improved import ConvNet
import os

from tf_idf import tfidf


def loadCorpus(folder):
    corpus = []
    fileList = []
    for subdir, dirs, files in os.walk(folder):
        for file in files:
            path = os.path.join(subdir, file)
            if path.endswith('.txt'):
                # paths look like <folder>/<ticker>/..._<year>...
                ticker = path[len(folder) + 1:]
                ticker = ticker[:ticker.index('/')]
                year = path[path.index('_') + 1:path.index('_') + 5]
                fileList.append(ticker + '-' + year)
                with open(path, 'r') as report_file:
                    corpus.append(report_file.read())
    return corpus, fileList


corpus, fileList = loadCorpus('data-part1')
data = tfidf(corpus)
data.to_csv('tfidf.csv')
with open("documentNames.txt", "w") as f:
    for fileName in fileList:
        f.write(str(fileName) + "\n")
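# The imported tfidf(corpus) is expected to return a DataFrame (it is written
# to CSV above). A sketch of an equivalent using scikit-learn; the actual
# tf_idf module may tokenize and weight terms differently.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_corpus_sketch(corpus):
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(corpus)  # sparse docs-x-terms matrix
    return pd.DataFrame(weights.toarray(),
                        columns=vectorizer.get_feature_names_out())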
# TODO: modified this if statement
# print("query we are looking at: " + str(query))
for compword in compwords:
    if compword in query and parameters.use_booleanSearch:
        booleanSearch.constructList(collection, query)
        parameters.use_blindRelevance = False
        ranBooleanResults = True

# create accumulators and other data structures
accum = {}
filenames = []
tfidfterms = {}
p = porter.PorterStemmer()
sw = StopWord.StopWord()
t = thesaurus.Thesaurus()
tfidf = tf_idf.tfidf()

# get N (collection size)
with open(collection + "_index_N", "r") as f:
    N = eval(f.read())  # note: eval on file contents; int(...) would be safer

# get document lengths/titles
titles = {}
with open(collection + "_index_len", "r") as f:
    lengths = f.readlines()  # one line per file: title and length
titleScore = 0

# get index for each term and calculate similarities using accumulators
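# The accumulator step the last comment refers to is not shown. A sketch of
# the usual scheme: for each query term, add tf-idf contributions into a
# per-document accumulator, then normalize by document length. The index
# layout here (term -> {doc: tf}) is an assumption.
def score_sketch(query_terms, index, idf_scores, doc_lengths):
    accum = {}
    for term in query_terms:
        for doc, tf in index.get(term, {}).items():
            accum[doc] = accum.get(doc, 0.0) + tf * idf_scores.get(term, 0.0)
    # normalize each accumulator by its document's length
    return {doc: s / doc_lengths[doc] for doc, s in accum.items()}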
def rocchio(k, news):
    # seed k centroids from the tf-idf matrix, then run a single
    # Rocchio-style assignment pass
    matrix = tfidf(news)
    centroid = start_optimal(k, matrix)
    kernel(centroid, matrix, news)
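# kernel(...) is not shown; in Rocchio/nearest-centroid classification each
# document gets the label of its most similar class centroid. A sketch,
# assuming dense numpy arrays and cosine similarity; the real kernel may
# report its results differently.
import numpy as np

def kernel_rocchio_sketch(centroid, matrix, news):
    m = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12)
    c = centroid / (np.linalg.norm(centroid, axis=1, keepdims=True) + 1e-12)
    labels = np.argmax(m @ c.T, axis=1)  # nearest centroid per document
    for doc, label in zip(news, labels):
        print(label, doc)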