def kmeans_optimal(k, news):
    # Build the tf-idf matrix and pick optimized initial centroids.
    matrix = tfidf(news)
    centroid = start_optimal(k, matrix)
    for ii in range(10):
        # Assign every document to its nearest centroid by cosine similarity.
        cluster = kernel_cos(centroid, matrix, k)
        print(rss(cluster, centroid))
        # Recompute each centroid from its cluster's members.
        centroid = new(cluster)
    # Report the final clustering.
    output(cluster, matrix, news)
def kmeans_euc(k, news):
    # Build the tf-idf matrix and pick random initial centroids.
    matrix = tfidf(news)
    centroid = start(k, matrix)
    for ii in range(10):
        # Assign every document to its nearest centroid by Euclidean distance.
        cluster = kernel_euc(centroid, matrix, k)
        print(rss(cluster, centroid))
        centroid = new(cluster)
    # Label the documents and report the final clustering.
    out = token__.labelize(news)
    output(cluster, matrix, out)
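
Both variants print an RSS value at every iteration, but the rss helper itself is not shown. A minimal residual-sum-of-squares sketch, assuming cluster maps each centroid index to its member vectors (an illustration of the idea, not the original helper):

import numpy as np

def rss(cluster, centroid):
    # Residual sum of squares: squared distance from every document
    # vector to its assigned centroid, summed over all clusters.
    total = 0.0
    for idx, members in cluster.items():
        for vec in members:
            total += float(np.sum((np.asarray(vec) - np.asarray(centroid[idx])) ** 2))
    return total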
Example #3
def m_hot_words_tfidf(data):
    printv('Calculating idf score.')
    # idf is computed once over every document's bag of words.
    idf = tf_idf.idf([v[ID_M_BOW] for v in data.values()])
    all_result = dict()
    for doc_id, bow in data.items():
        printv('Calculating tf-idf score for ' + doc_id)
        result = list()
        for term in set(bow[ID_M_BOW]):
            tf = tf_idf.tf(term, bow[ID_M_BOW])
            result.append((term, round(tf_idf.tfidf(term, tf, idf), ROUND_DIGITS)))
        # Keep only the RESULT_LENGTH highest-scoring terms per document.
        all_result[doc_id] = dict()
        for word, score in sorted(result, key=lambda x: x[1], reverse=True)[:RESULT_LENGTH]:
            all_result[doc_id][word] = score
    return all_result
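
The tf_idf module used above is not shown. A minimal sketch of helpers with matching signatures, assuming raw-count tf and plain log idf (the real module may weight differently):

import math

def tf(term, bow):
    # Raw term frequency: occurrences of `term` in the bag of words.
    return bow.count(term)

def idf(bows):
    # Inverse document frequency for every term across all documents.
    n_docs = len(bows)
    doc_freq = {}
    for bow in bows:
        for term in set(bow):
            doc_freq[term] = doc_freq.get(term, 0) + 1
    return {term: math.log(n_docs / df) for term, df in doc_freq.items()}

def tfidf(term, tf_score, idf_scores):
    # tf-idf is simply the product of the two scores.
    return tf_score * idf_scores.get(term, 0.0)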
Example #4
def data_x():
    # data=pd.read_csv('quora_duplicate_questions.tsv',delimiter='\t',encoding='utf-8')
    # data=data.iloc[2000:2500,:]
    str1 = input('Enter your question 1 - ')
    str2 = input('Enter your question 2 - ')
    # Wrap the pair in a one-row frame with the same layout as the Quora TSV.
    daa = [[0, 1, 2, str1, str2]]
    data = pd.DataFrame(daa)
    x = data.iloc[:, 1:5]
    # y=data.iloc[:,5].values
    x = rmpun(x)
    x = lower(x)

    dataf = pd.DataFrame()
    temp = pd.DataFrame()

    for i in range(len(x)):
        # Basic features (accumulated inside the bf module): length and
        # word differences, shared start words, common words.
        bf.diff_len(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        bf.diff_words(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        bf.start_words(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        bf.common_words(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        # Advanced features (accumulated inside the af module): fuzzy
        # ratios, Jaccard similarity, word share.
        af.fuzz_ratio(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        af.fuzz_sort(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        af.jaccard(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        af.word_share(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        # Part-of-speech features come back as a row to append.
        pos_data = af.parts_of_speech(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        temp = temp.append(pos_data, ignore_index=True, sort=False)
        # tf-idf features for the question pair.
        dataframe = tf_idf.tfidf(str(x.iloc[i, 2]), str(x.iloc[i, 3]))
        print(i)

    # Collect the accumulated feature lists from both modules.
    dif, diwords, start_word, common_word = bf.all_data()
    ratio, sort_ratio, jcc_sim, wo_sh = af.all_data()

    dataf['diff_len'] = dif
    dataf['diff_words'] = diwords
    dataf['start_words'] = start_word
    dataf['common'] = common_word
    dataf['fuzz_ratio'] = ratio
    dataf['fuzz_sort'] = sort_ratio
    dataf['jaccard'] = jcc_sim
    dataf['word_share'] = wo_sh
    dataf = pd.concat([dataf, temp], axis=1)
    dataf = pd.concat([dataf, dataframe], axis=1)
    # y=np.array(y)
    # dataf['target']=y
    dataf.to_csv('aaa.csv', index=False)

    return dataf
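
The bf and af feature modules are defined elsewhere. As a rough illustration of the kind of pairwise feature they compute, here is a minimal Jaccard similarity in the spirit of af.jaccard (the name and behavior are assumptions, not the project's actual code):

def jaccard(q1, q2):
    # Jaccard similarity: |intersection| / |union| of the two token sets.
    a, b = set(q1.split()), set(q2.split())
    if not (a or b):
        return 0.0
    return len(a & b) / len(a | b)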
Example #5
#from CNN_improved import ConvNet
import os
from tf_idf import tfidf


def loadCorpus(folder):
    # Walk `folder`, read every .txt report, and derive a ticker-year
    # label for each file from its path.
    corpus = []
    fileList = []
    for subdir, dirs, files in os.walk(folder):
        for file in files:
            path = os.path.join(subdir, file)
            if path.endswith('.txt'):
                # The first path component under `folder` is the ticker symbol.
                ticker = path[len(folder) + 1:]
                ticker = ticker[:ticker.index(os.sep)]
                # The four characters after the first underscore are the year.
                year = path[path.index('_') + 1:path.index('_') + 5]
                fileList.append(ticker + '-' + year)
                with open(path, 'r') as report:
                    corpus.append(report.read())
    return corpus, fileList


corpus, fileList = loadCorpus('data-part1')
data = tfidf(corpus)

data.to_csv('tfidf.csv')

with open("documentNames.txt", "w") as f:
    for fileName in fileList:
        f.write(str(fileName) + "\n")
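
The imported tf_idf.tfidf is not shown here; judging by the .to_csv call it returns a pandas DataFrame. A stand-in with that contract, sketched with scikit-learn's TfidfVectorizer (an assumption, not this project's implementation):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(corpus):
    # One row per document, one column per vocabulary term.
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(corpus)
    return pd.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names_out())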
Example #6
# TODO: revisit this modified if statement
#print("query we are looking at: " + str(query))
for compword in compwords:
    if compword in query and parameters.use_booleanSearch:
        booleanSearch.constructList(collection, query)
        parameters.use_blindRelevance = False
        ranBooleanResults = True

# create accumulators and other data structures
accum = {}
filenames = []
tfidfterms = {}
p = porter.PorterStemmer()
sw = StopWord.StopWord()
t = thesaurus.Thesaurus()
tfidf = tf_idf.tfidf()

# get N (the collection size) from the index
with open(collection + "_index_N", "r") as f:
    N = eval(f.read())

# get document lengths/titles
titles = {}
with open(collection + "_index_len", "r") as f:
    lengths = f.readlines()  # one line per file title and its length

titleScore = 0

# get index for each term and calculate similarities using accumulators
def rocchio(k, news):
    # Build the tf-idf matrix, seed centroids optimally, and run the
    # assignment kernel over the documents.
    matrix = tfidf(news)
    centroid = start_optimal(k, matrix)
    kernel(centroid, matrix, news)
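
The comment about calculating similarities with accumulators refers to code elided from this excerpt. A minimal sketch of that pattern, assuming a get_postings(term) helper that returns {doc_id: term_frequency} (the names are illustrative, not this project's API):

import math

def score_query(query_terms, get_postings, N):
    # Accumulate a tf-idf score per document across all query terms.
    accum = {}
    for term in query_terms:
        postings = get_postings(term)
        if not postings:
            continue
        idf = math.log(N / len(postings))  # rarer terms weigh more
        for doc_id, tf in postings.items():
            accum[doc_id] = accum.get(doc_id, 0.0) + tf * idf
    # Highest-scoring documents first.
    return sorted(accum.items(), key=lambda kv: kv[1], reverse=True)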