Example #1
def predict(n, url):
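    # Fetch and parse the article with the newspaper library, then score its
    # preprocessed body text and title with tf-idf.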
    article = Article(url, language="en")
    article.download()
    article.parse()
    article.nlp()
    content = pre_process_article(str(article.text))
    title = pre_process_article(str(article.title))
    tf_idf(content, title, n)
Example #2
def runEngine(tokenizer_type, address):
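    # Load the precomputed inverted index and per-document tf-idf (champions)
    # weights, then answer queries interactively until "!q" is entered,
    # printing the top-10 matching titles.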
    print("RUNNING ENGINE ...")
    inverted_index = load_obj("INVERTED INDEX " + tokenizer_type + "_" +
                              address)
    tfidfs = load_obj("CHAMPIONS " + tokenize_type + "_" + address)

    query = ""

    while True:
        query = input("QUERY :>")
        if query == "!q": break

        queryToken = query.split(" ")
        queryw = tf_idf(queryToken, inverted_index, False)

        h = Heap()

        for i in range(len(tfidfs)):
            if bool(set(queryToken) & set(tfidfs[i].keys())):
                sim = querySimilarity(queryw, tfidfs[i])
                if sim != 0:
                    h.addnSort([i, sim])

        k = 10
        result = h.getFirstK(k)
        k = min(len(result), k)  # guard: there may be fewer than k results
        titles = fetch_column(address, 'title')
        for i in range(k):
            print(titles[result[i][0]][::-1])
Example #3
def getChampions(tokens, inverted_index):
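    # Build a champions list: for each term in the index, keep the (up to) 10
    # documents with the highest tf-idf weight for that term.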
    tfidfs = []
    for i in range(len(tokens)):
        tfidfs.append(tf_idf(tokens[i], inverted_index, False))

    champions_term = {}
    champions_list = {}

    # TODO: Optimize this mess
    for term in inverted_index:
        champions_term[term] = [None] * inverted_index[term][0]
        for i in range(0, inverted_index[term][0]):
            champions_term[term][i] = tfidfs[inverted_index[term][i + 1]][term]

    for term in champions_term:
        y = list(
            zip(*heapq.nlargest(10,
                                enumerate(champions_term[term]),
                                key=operator.itemgetter(1))))[0]
        champions_list[term] = list(y)

    for term in champions_list:
        l = min(10, len(champions_list[term]))
        for i in range(l):
            champions_list[term][i] = inverted_index[term][
                champions_list[term][i] + 1]

    return champions_list
Example #4
    def make_tf_matrix(self, doc_list):
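        # Build one row per document, placing each word's tf-idf weight at its
        # vocabulary index.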
        tf_matrix = []

        for doc in doc_list:
            row = [0] * self.count
            for word in doc.split(' '):
                row[self.index[word]] = tfidf.tf_idf(word, doc, doc_list)
            tf_matrix.append(row)

        return tf_matrix
Example #5
def generate_classifier_model(tokenized_sentences, features):
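    # Represent each tokenized sentence as a sparse {feature index: tf-idf}
    # mapping, keeping only features with a positive score.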
    model = []

    import tfidf
    tfidf.cache_enabled(True)

    for sentence in tokenized_sentences:
        tweet_model = {}
        for i, feature in enumerate(features):
            tfidfv = tfidf.tf_idf(feature, sentence, tokenized_sentences)

            if tfidfv > 0:
                tweet_model[i] = tfidfv
        model.append(tweet_model)
    return model
Example #6
def compute_document_score_by_term(term, tags=None):
    # documents = find_documents(term)
    scores = tfidf.tf_idf(term, tags)
    # start_time = time.time()
    # res = tfidf.terms_collection.aggregate([
    #     {'$match': {'term': term}},
    #     {
    #         '$group': {
    #             '_id': "$doc",
    #             'count': {'$sum': 1}
    #         }
    #     }
    # ])
    # for doc in res:
    #     scores[doc['_id']] = tfidf.tf_idf(term, doc['_id'], doc['count'])
    # print "Find Score Time = ", str((time.time() - start_time))

    # for doc in documents:
    #     scores[doc] = tfidf.tf_idf(term, doc)
    return scores
Example #7
    def callback(self):
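        # GUI callback: read the query from the text widget, score it against
        # the cached per-document tf-idf vectors, and show the top-k titles in
        # the listbox.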
        print("GOT TOKEN ", self.text.get("1.0", END))
        queryToken = self.text.get("1.0", END).replace("\n", "").split(" ")
        queryToken = normalize_query(queryToken)
        print(queryToken)
        queryw = tf_idf(queryToken, self.inverted_index, False)

        h = Heap()

        for i in range(len(self.tfidfs)):
            if bool(set(queryToken) & set(self.tfidfs[i].keys())):
                sim = querySimilarity(queryw, self.tfidfs[i])
                if sim != 0:
                    h.addnSort([i, sim])

        k = 10
        result = h.getFirstK(k)
        k = min(len(result), k)
        print(k)
        for i in range(k):
            print(self.titles[result[i][0]])
            self.mylist.delete(i)
            self.mylist.insert(i, self.titles[result[i][0]])
Example #8
def get_bag_of_words_labels(preprocessed_records, args):
    """Gets the labels for the bag of words. A label can be a a single important word, a collocation of two important
    words, or a set of synonyms of a word.

    Params:
    - preprocessed_records (pyspark.rdd.RDD): The tokenized, lemmatized, lowercase records
    - args (argparse.Namespace): The command-line arguments passed to the program

    Returns:
    - bag_of_words_labels (list<str|tuple<str>>): The labels of the bag of words created
    """
    reformatted_records = preprocessed_records.map(lambda record: (record['id'], record['preprocessed_record']))
    frequent_collocations = wordcount.extract_collocations(reformatted_records, args.num_collocations,
                                                           args.collocation_window)
    tf_idf_scores = tfidf.tf_idf(reformatted_records)
    # Pyspark technically ends here - the rest is processed on master node
    important_words = tfidf.extract_important_words(tf_idf_scores, args.num_words, False)
    # important_words_with_counts = synsets.add_word_counts(important_words, frequent_words)
    synset_dict = synsets.generate_syn_set(important_words)
    words_and_collocations = wordcount.merge_collocations_with_wordlist(frequent_collocations, important_words)
    # Merge words, collocations and synsets
    bag_of_words_labels = list()
    for item in words_and_collocations:
        if " " in item:  # item is a collocation
            bag_of_words_labels.append(item)
        elif item in synset_dict:  # item is an important word
            synset = synset_dict[item]
            if len(synset) == 1:  # synset only contains the word itself
                bag_of_words_labels.append(item)
            else:  # synset contains multiple words
                synset = [word.encode('utf-8') for word in synset[1:]]
                bag_of_words_labels.append(synset)
    # Save bag of words labels to single text file
    with open("bag_of_words_labels.json", "w") as bow_file:
        json.dump(bag_of_words_labels, bow_file)
    return bag_of_words_labels
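To make the merging rule in the loop above concrete, here is a small hand-worked illustration with hypothetical inputs (the real words_and_collocations and synset_dict come from wordcount and synsets):

# Hypothetical inputs, for illustration only
words_and_collocations = ["climate change", "storm", "flood"]
synset_dict = {"storm": ["storm"], "flood": ["flood", "deluge", "inundation"]}

# Applying the loop above:
#   "climate change" contains a space  -> kept as a collocation string
#   "storm" has a singleton synset     -> kept as the word itself
#   "flood" has a multi-word synset    -> replaced by its synonyms minus the head word
#   anything not in synset_dict        -> dropped
# Resulting bag_of_words_labels:
#   ["climate change", "storm", [b"deluge", b"inundation"]]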
Example #9
def main_(set_, v=None):
    data_pool = os.listdir(PATH + YEAR)  # prepare the full sample of files
    s = sorted(list(set_))  # sort the codes of the selected sample

    res = []
    lst = []

    # Take the first 4 characters of each filename and convert to int for searching
    for i in range(len(data_pool)):
        data_pool[i] = int(data_pool[i][0:4])

    for key in s:
        res = binary_search(data_pool, key)  # binary search
        lst.append(res)  # store the index (locates the CSV of the chosen industry)

    for w in lst:
        write_txt(data_pool[w], YEAR)

    # Compare the documents two at a time
    for idx in lst:
        for idx_ in lst:
            print(str(data_pool[idx]) + "<--->" + str(data_pool[idx_]))
            s1 = get_txt(data_pool[idx])
            s2 = get_txt(data_pool[idx_])
            j1 = jieba_(s1)
            j2 = jieba_(s2)
            vector = tf_idf(j1, j2)
            cos = get_cos(vector[0], vector[1])

            # "化學" means "chemical" (output file name)
            with open("化學" + YEAR + ".txt", "a+") as f:
                f.write(str(data_pool[idx]) + "<--->" + str(data_pool[idx_]) + " : " + str(cos) + "\n")
Example #10
import re
import tfidf
#import re
#pattern = '[0-9]+[\t]+(.+)[\t]+[0-9]+'
#p = re.compile(pattern)
#m = p.search("\"1	theo walcott is still shit, watch rafa and johnny deal with him on saturday.	1\"")
#content = m.group(1)
#print(content)

# Test tf-idf on two tiny documents:
#   doc 1: "i have have a dog"
#   doc 2: "i like"
# Term frequencies (normalized by each document's most frequent term):
#   doc 1: i: 1/2, have: 2/2, a: 1/2, dog: 1/2
#   doc 2: i: 1,   like: 1
# Expected tf-idf ("i" appears in both documents, so its idf is 0):
#   doc 1: i: 0, have: 1, a: 1/2, dog: 1/2
#   doc 2: i: 0, like: 1

res = tfidf.tf_idf([['i', 'have', 'have', 'a', 'dog'], ['i', 'like']])
tfidf.dump_tfidf_json(res, 'res.json')
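For reference, a minimal sketch of a tf_idf function consistent with the hand-computed values above, assuming tf is normalized by each document's most frequent term and idf = log2(num_docs / doc_frequency); the actual tfidf module may weight terms differently, and tf_idf_sketch is a hypothetical name:

import math
from collections import Counter

def tf_idf_sketch(docs):
    # docs: list of token lists; returns one {term: tf-idf} dict per document
    n_docs = len(docs)
    df = Counter(term for doc in docs for term in set(doc))
    weights = []
    for doc in docs:
        counts = Counter(doc)
        max_count = max(counts.values())
        weights.append({
            term: (count / max_count) * math.log2(n_docs / df[term])
            for term, count in counts.items()
        })
    return weights

print(tf_idf_sketch([['i', 'have', 'have', 'a', 'dog'], ['i', 'like']]))
# [{'i': 0.0, 'have': 1.0, 'a': 0.5, 'dog': 0.5}, {'i': 0.0, 'like': 1.0}]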
Example #11
        raw_data, raw_labels = load_data_from_file()
        print(
            f'[main] Raw data dims - tweets: {len(raw_data)} / labels: {len(raw_labels)}'
        )

        # Pre-process tweets & labels:
        corpus, labels = preprocess(raw_data, raw_labels, stemmer_flag)

        store_processed_data(corpus, labels)  # save processed data to file
    print(
        f'[main] Processed data dims - corpus: {len(corpus)} / labels: {len(labels)}'
    )

    # Compute TF-IDF or Graph of Words
    if method == 'TFIDF':
        X = tf_idf(corpus)
    elif method == 'GOW':
        X = gow(corpus)
    else:
        raise ValueError(f'[main] Invalid method: {method}')

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=RANDOM_STATE,
    )
    print(
        f'[main] Training/test set dimensions: {len(y_train)}/{len(y_test)}\n')
Example #12
    algo = 'MLP'
    method = 'TFIDF'
    test_size = .2

    estimator = get_classifier(algo)
    X, y = load_processed_data()

    # create multi-label like settings
    handles = get_handles()
    y = label_binarize(y, classes=[handles[0], handles[1], handles[2]])

    n_classes = y.shape[1]

    # Compute TF-IDF or Graph of Words
    if method == 'TFIDF':
        X = tf_idf(X)
    elif method == 'GOW':
        X = gow(X)
    else:
        raise ValueError(f'[main] Invalid method: {method}')

    # Split into training and test
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE)
    X_train, X_test = normalize_X(X_train, X_test)

    # Load the best model from GridSearchCV()
    model = load_model(algo, method, 'tuning_random')
    params = model.best_params_
    print(f'Best model parameters: \n{params}\n')
    #params.update({'n_jobs': n_jobs},)
Example #13
import tool
import tfidf

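# Read tweet contents, stem and tokenize them, then compute tf-idf scores
# and dump them to JSON.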
filename = 'tweet.txt'
content_list = tool.read_tweet_content(filename)
print(content_list[0])
print('- - ' * 50)
print(content_list[1])
stem_list = tfidf.preprocess(content_list)
print('- - ' * 50)
print(stem_list[0])
print('- - ' * 50)
print(stem_list[1])
token_2d_list = tfidf.tokenization(stem_list)
print(token_2d_list[0])
print(token_2d_list[1])

print('res')
res = tfidf.tf_idf(token_2d_list)
print('dump')
tfidf.dump_tfidf_json(res, 'tfidf.json')
Example #14
from tfidf import tf_idf
from similarity import cosine_similarity

import os
copyrighted_works='/Users/sm.marouzigmail.com/Documents/job/wattpad/library/'
user_text_file = '/Users/sm.marouzigmail.com/Documents/job/wattpad/test/test.txt'
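# Compare the uploaded text against every .txt file in the copyrighted
# library using tf-idf term vectors and cosine similarity.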

# read the content of new uploaded text
try:
    with open(user_text_file, "r", encoding='iso-8859-1') as ifile:
        raw_text = ifile.read() # raw text of new uploaded file
except:
    with open(user_text_file, "r", encoding='utf-8') as ifile:
        raw_text = ifile.read() # raw text of new uploaded file
tfs_dict1 = tf_idf(raw_text)


library_path=os.listdir(copyrighted_works)

# list of text files
files = [file for file in library_path if file.endswith('.txt')]

for file in files:
    # read text data for each copyrighted content
    # extract TF_IDF terms and values
    # calculate the similarity between this vector and uploaded text file vector
    
    full_path_file = os.path.join(copyrighted_works, file)
    # read raw texts one by one from copyright library path
    try:
Example #15
    parser = argparse.ArgumentParser(description='Generate Inverted Index')
    parser.add_argument('address', help='csv address', action='store')
    parser.add_argument('tokenize',
                        help='type of tokenization: simple or pro',
                        default='simple')
    args = parser.parse_args()
    csv_address = args.address
    tokenize_type = args.tokenize

    print("READING ", csv_address)
    contents = fetch_column(csv_address, 'content')
    tokens = tokenize(tokenize_type, contents)

    print("TOKENIZING ", tokenize_type)
    if tokenize_type == 'pro':
        tokens = normilize(tokens)
        tokens = stem_list(tokens)

    print("BUILDING DICTIONARY AND INVERTED INDEX")
    dictionary = build_dictionary(tokens)
    inverted_index = build_inverted_index(tokens, dictionary)

    print("WRITING INDEX AND SCORES")
    writeObj("INVERTED INDEX", inverted_index, tokenize_type, csv_address)

    tfidfs = []
    for i in range(len(tokens)):
        tfidfs.append(tf_idf(tokens[i], inverted_index, True))

    writeObj("TFIDF", tfidfs, tokenize_type, csv_address)
Example #16

def indices(n):
    for i, j in product(range(n), range(n)):
        if j < i: yield i, j


if __name__ == '__main__':
    limit = 1000
    tfidf_thresh = 0.02

    vocab = open('data/vocab.txt', 'r').read().split('\n')
    embeddings = np.load('data/embeddings.npy', allow_pickle=False)
    tok2id = {tok: i for i, tok in enumerate(vocab)}

    tf, idf = tf_idf(stream_tokens(limit))

    # Compute TF-IDFs
    # and create doc representations
    print('Creating doc matrices...')
    pmid_idx = {}
    mats = []
    for pmid, toks in tqdm(stream_tokens(limit)):
        ems = []
        for tok in set(toks):
            tfidf = tf[pmid][tok] * idf[tok]
            if tfidf >= tfidf_thresh:
                tid = tok2id.get(tok)
                if tid is None:
                    continue
                em = embeddings[tid]
Example #17

def make_data():
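    # Read one sentence per line and build a {line number: sentence} dict.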
    f = open('cl.sents.txt', 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    docs = {}
    i = 0
    for el in texts:
        docs[i] = el
        i = i + 1
    return docs


docs = make_data()
index = tf_idf(docs)


def make_data2():
    f = open('sents.txt', 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    docs2 = {}
    i = 0
    for el in texts:
        docs2[i] = el
        i = i + 1
    return docs2


docs2 = make_data2()
Example #18
    :return: A dictionary mapping words in wordlist to their word counts
    """
    wordlist_with_counts = dict()
    for word in wordlist:
        if word in word_counts:
            wordlist_with_counts[word] = word_counts[word]
        else:
            print(word, " not in word_counts dictionary but in given wordlist")
    return wordlist_with_counts


# End of add_word_counts()
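A quick illustration of what this helper returns, with hypothetical inputs:

# add_word_counts(["storm", "flood"], {"storm": 12})
#   prints a warning that "flood" is not in the word_counts dictionary
#   and returns {"storm": 12}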

if __name__ == "__main__":
    args = wordcount.parse_arguments()
    records = wordcount.load_records(args.file, False)
    records = wordcount.preprocess_records(records)
    frequent_words = wordcount.extract_frequent_words(records,
                                                      args.num_words * 10,
                                                      False)
    frequent_words = dict(frequent_words)
    tf_idf_scores = tfidf.tf_idf(records)
    # Pyspark technically ends here - the rest is processed on master node
    important_words = tfidf.extract_important_words(tf_idf_scores,
                                                    args.num_words, True)
    important_words_with_counts = add_word_counts(important_words,
                                                  frequent_words)
    synset_dict = generate_syn_set(important_words_with_counts.items())
    print_syn_set(synset_dict)