Example #1
def textrank_labeling(apply_sub_clustering=False):
    """Labeling using textrank"""
    from summa import keywords_custom as keywords
    clusters = list(get_clusters())
    del clusters[-1]

    if apply_sub_clustering:
        clusters_labels = []
        for cluster in clusters:
            sub_clusters = sub_clustering(cluster)
            sub_clusters_size_ratio = [(len(cl) / len(cluster))
                                       for cl in sub_clusters]
            text_sub_clusters = [" ".join(cl) for cl in sub_clusters]

            sub_clusters_top_terms = []
            for text_sub_cluster in text_sub_clusters:
                terms = keywords.keywords(text_sub_cluster)
                terms = {
                    utils.stem(term): weight
                    for term, weight in terms.items()
                }
                terms = [(term, weight) for term, weight in terms.items()]
                sub_clusters_top_terms.append(terms[:10])

            terms = __combine_terms(sub_clusters_top_terms,
                                    sub_clusters_size_ratio)
            clusters_labels.append(get_labels(cluster, terms))
    else:
        text_clusters = [" ".join(cluster) for cluster in clusters]
        clusters_top_terms = []
        for text_cluster in text_clusters:
            terms = keywords.keywords(text_cluster)
            terms = {
                utils.stem(term): weight
                for term, weight in terms.items()
            }
            terms = [(term, weight) for term, weight in terms.items()]
            clusters_top_terms.append(terms[:10])

        clusters_labels = [
            get_labels(cluster, clusters_top_terms[cluster_index])
            for cluster_index, cluster in enumerate(clusters)
        ]

    print_result(clusters_labels)
    #print(clusters_labels)
    print(
        evaluate_clusters([[label[0] for label in labels]
                           for labels in clusters_labels]))
Example #2
def flatten_projects(paths, outfn):
    def dursum(p):
        return sum(p.durations[j] for j in p.J) + 1

    maxT = max(dursum(project.load_project(path)) for path in paths)
    with open(outfn, 'w') as fp:
        value_count = len(project.flatten_project(paths[0], maxT))
        fp.write('instance;' +
                 ';'.join(['v' + str(ix) for ix in range(value_count)]) + '\n')
        for path in paths:
            instanceName = utils.stem(path)
            fp.write(instanceName + ';' +
                     ';'.join(project.flatten_project(path, maxT)) + '\n')
Example #3
def norm(df: pd.DataFrame) -> pd.Series:
    ddf = df.copy(deep=False)
    lemmatizer = ns.WordNetLemmatizer()
    topics = []
    for t_item, o_item in zip(ddf["topics"], ddf["origin"]):
        # author topics first select
        if o_item == "ieee":
            if "Author" in t_item:
                ts = re.split(",", re.split(":", t_item)[-1])
            elif "IEEE" in t_item:
                ts = re.split(
                    ",",
                    re.search("IEEE Keywords:(.*?);", t_item).groups()[0])
            else:
                try:
                    ts = re.split(
                        ",",
                        re.search("INSPEC: Controlled Indexing:(.*?);",
                                  t_item).groups()[0])
                except AttributeError:  # re.search found no INSPEC controlled-indexing terms
                    ts = re.split(",", str(t_item))
        else:
            ts = re.split(",", str(t_item))

        # topic of one paper process
        ts = hero.remove_html_tags(hero.lowercase(pd.Series(ts)))
        topic = []
        for t in ts:
            t = t.replace(" - ", "-")  # str.replace returns a new string, so keep the result
            if len(re.split("and", t)) == 2 and "-" not in t:
                topic += re.split("and", t)
                continue
            if len(re.split("/", t)) == 2:
                topic += re.split("/", t)
                continue
            if "blockchain" in t and len(re.split(" ", t)) >= 2:
                t = re.split(" ", t)[-1]
            if t != "":
                topic.append(t.replace("\xa0", ""))
        topics.append(",".join([
            similar_replace(stem(remove_chore(t), lemmatizer)) for t in topic
        ]))
        # topics.append(",".join([stem(remove_chore(t), lemmatizer) for t in topic]))
    return pd.Series(topics)
Example #4
    def generate_interests_keywords(self, userid):
        # Initialise ref_topics
        self.get_ref_topics()

        # Get top-10 user topics
        user_topics = get_topics.get_topics(userid)

        # Get top-20 keywords
        top20_keywords = []
        for i in range(len(user_topics)):
            for j in range(0, 2):
                top20_keywords.append(utils.stem(user_topics[i][1][j][0]))

        # Fit top-20 keywords in ref_topics to get interests keywords
        for i in range(0, len(top20_keywords)):
            keyword = top20_keywords[i]
            for j in range(0, len(self.ref_topics)):
                if keyword in self.ref_topics[j][1]:
                    self.interests.append(self.ref_topics[j])

        return self.interests
Example #5
# print(intents) 
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        # use extend instead of append as we don't want an array of arrays
        xy.append((w,tag))

ignore_words = ['?','!','[',']','.',',']
all_words = [stem(w) for w in all_words if w not in ignore_words]

all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []

for (sen, tag) in xy:
    bag = bow(sen,all_words)
    X_train.append(bag)

    # multiclass label
    label = tags.index(tag)
    y_train.append(label)
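
The chatbot-training snippets here and in Examples #11, #14, and #18 call tokenize, stem, and a bag-of-words helper (bow / bag_of_words) that live in a utility module not shown on this page. A minimal sketch of what those helpers might look like, assuming NLTK and NumPy; the names and details below are assumptions, not the projects' actual code:

import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def tokenize(sentence):
    # split a sentence into word and punctuation tokens
    return nltk.word_tokenize(sentence)

def stem(word):
    # reduce a word to its lowercase root form
    return stemmer.stem(word.lower())

def bow(tokenized_sentence, all_words):
    # binary bag-of-words: 1.0 wherever a vocabulary word occurs in the sentence
    sentence_stems = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, word in enumerate(all_words):
        if word in sentence_stems:
            bag[idx] = 1.0
    return bag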
Example #6
           (mfcc_deltas_deltas_transposed, mfcc_deltas_deltas_transposed_mean)


if __name__ == '__main__':
    num_mfcc = 13
    use_deltas = True
    (sample_rate,
     signal) = wavfile.read("speakers/russian/female/anonymous104/ru_0036.wav")
    samples_without_pauses = remove_pauses(sample_rate,
                                           normalize_signal(signal))
    mfcc_features1 = get_mfcc_features(sample_rate, samples_without_pauses,
                                       num_mfcc, use_deltas)
    (sample_rate,
     signal) = wavfile.read("speakers/russian/female/anonymous104/ru_0037.wav")
    samples_without_pauses = remove_pauses(sample_rate,
                                           normalize_signal(signal))
    mfcc_features2 = get_mfcc_features(sample_rate, samples_without_pauses,
                                       num_mfcc, use_deltas)
    # Check the coefficient values in a few frames
    plt.subplot(2, 1, 1)
    stem(mfcc_features1[0, :num_mfcc - 1], linefmt='r', markerfmt='ro')
    stem(mfcc_features1[1, :num_mfcc - 1], linefmt='b', markerfmt='bo')
    stem(mfcc_features1[20, :num_mfcc - 1], linefmt='y', markerfmt='yo')
    plt.grid(True)
    plt.subplot(2, 1, 2)
    stem(mfcc_features2[0, :num_mfcc - 1], linefmt='r', markerfmt='ro')
    stem(mfcc_features2[1, :num_mfcc - 1], linefmt='b', markerfmt='bo')
    stem(mfcc_features2[20, :num_mfcc - 1], linefmt='y', markerfmt='yo')
    plt.grid(True)
    plt.show()
Example #7
def word_is_difficult(word, with_stemming=True):
	if with_stemming:
		return stem(word.lower()) not in FAMILIAR_STEMS
	return word not in FAMILIAR_WORDS
Example #8
from utils import (
	avg_sentence_length_for_doc,
	doc_from_path,
	text_files_in_directory,
	load_nlp,
	words_for_doc,
	stem,
	)

def get_words(path):
	with open(path) as f:
		return [line.strip() for line in f.readlines()]

familiar_words = get_words('resources/familiar-words.txt')
FAMILIAR_WORDS = set(familiar_words)
FAMILIAR_STEMS = set([stem(w) for w in familiar_words])

def difficult_word_rate_for_doc(doc):
	words = words_for_doc(doc)
	difficult_words = [w for w in words if word_is_difficult(w)]
	return (len(difficult_words) / len(words)) * 100

def difficult_words_for_doc(doc):
	return [w for w in words_for_doc(doc) if word_is_difficult(w)]

def word_is_difficult(word, with_stemming=True):
	if with_stemming:
		return stem(word.lower()) not in FAMILIAR_STEMS
	return word not in FAMILIAR_WORDS

def dc_grade_level(doc):
Example #9
objs = []    
sentences = []  # sentences for training word2vec model

ignore = set("for the a an of for on in and to as or : , . ( ) ? !".split(" "))   # words to ignore when parsing research paper titles

author2paper = defaultdict(list)   
paper2author = defaultdict(dict)


with open('trial_data.json') as f:
  data = json.load(f)
  length = len(data)
  for i, obj in enumerate(data):
    if i > 50: break
    processed = [stem(word.lower()) for word in tokenize(obj['title']) if word.lower() not in ignore and not word.isdigit()] 
    obj["processed"] = processed
    sentences.append(processed)
    objs.append(obj)

    # configure paper2author and add processed arr as one of the key val pairs
    paper2author[obj["title"]]['author'] = obj["author"]
    paper2author[obj["title"]]['processed'] = processed
    paper2author[obj["title"]]['similarity'] = [float('inf')] * length
    paper2author[obj["title"]]['index'] = i

    for author in obj["author"]:
      author2paper[author].append(obj["title"])

# train the model and get the feature vector for each paper
model = models.Word2Vec(sentences, min_count=1, size=7, window=2)
Example #10
def process_content(real_content, lang):
    real_content = utils.strip_accents(real_content)
    real_content = utils.remove_stopwords(real_content, lang)
    real_content = utils.stem(real_content, lang)
    return real_content
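
Example #10 chains three language-aware helpers from a utils module that is not shown. A minimal sketch of what such helpers could look like, assuming NLTK's stopword lists and Snowball stemmer; the signatures and behavior below are assumptions, not the project's actual implementation:

import unicodedata
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def strip_accents(text):
    # drop combining accent marks after Unicode decomposition
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')

def remove_stopwords(text, lang):
    # lang is a language name such as 'english' or 'spanish'
    stop = set(stopwords.words(lang))
    return ' '.join(w for w in text.split() if w.lower() not in stop)

def stem(text, lang):
    # stem every whitespace-separated token with the language-specific stemmer
    stemmer = SnowballStemmer(lang)
    return ' '.join(stemmer.stem(w) for w in text.split())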
Example #11
#print(intents)

all_words = []
tags = []
x_y = []
for inten in intents["intents"]:
    tag = inten["tag"]
    tags.append(tag)
    for pattern in inten["patterns"]:
        wrd = tokenize(pattern)
        all_words.extend(wrd)
        x_y.append((wrd, tag))

# ignore some symbols
ignore_sym = ['?', '.', '!', ',', "'", '-']
all_words = [stem(wrd) for wrd in all_words if wrd not in ignore_sym]
# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

#print(len(x_y), "patterns")
#print(len(tags), "tags:", tags)
#print(len(all_words), "unique words:", all_words)

# Creating data-set
X_train = []
Y_train = []
for (ptrn_sent, tag) in x_y:
    bag = bag_of_words(ptrn_sent, all_words)
    X_train.append(bag)
    label = tags.index(tag)
Example #12
import json
import numpy as np
from gensim.models import Word2Vec
from utils import tokenize, stem, cosineSimilarity

model = Word2Vec.load('model.bin')

validateTitle = "Robust adaptive single neural control for a class of uncertain nonlinear systems with input nonlinearity"
ignore = set("for the a an of for on in and to as or : , . ( ) ? !".split(
    " "))  # words to ignore when parsing research paper titles

processed = [
    stem(word.lower()) for word in tokenize(validateTitle)
    if word.lower() not in ignore and not word.isdigit()
]


def getFeatureVector(wordList):
    res = []
    for word in wordList:
        try:
            res.append(model[word])
        except KeyError:
            # word never seen by the model: fall back to a zero vector
            # (7 matches the word2vec feature-size hyperparameter)
            res.append(np.zeros(7, dtype=np.float32))
    return res


feature = getFeatureVector(processed)

author = ['Bryant Zhou']
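
Example #12 imports a cosineSimilarity helper that is not shown on this page. A minimal sketch of a cosine similarity over averaged word vectors, offered only as an assumption of how such a helper might work:

import numpy as np

def cosineSimilarity(vectors_a, vectors_b):
    # average each list of word vectors into a single title vector,
    # then compute the cosine of the angle between the two averages
    a = np.mean(vectors_a, axis=0)
    b = np.mean(vectors_b, axis=0)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0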
Example #13
import sys
import string
import re
import json

import xml.etree.ElementTree as ET
import nltk.data
from nltk.stem import WordNetLemmatizer

import utils

content, full_word_dict = utils.stem(utils.split(sys.argv[2]))
content = content[1:-1].split(', ')
s = utils.getStructure(content)
# dictionary
res = utils.buildDict(s, full_word_dict)
if res:
    dictionary = json.dumps(res)
    #output dictionary
    if sys.argv[1] == "Dictionary":
        print(dictionary) if dictionary else print("")

#output ecosystem/animal
if sys.argv[1] == "Topic":
    if res:
        print(next(iter(res)).title()) if next(iter(res)) else print("")
    else:
        for s in content:
            words = s.split(' ')
            for word in words:
                word = word.translate(str.maketrans('', '',
Example #14
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

#Stem and lower each word
ignored_chars = ['?','!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignored_chars]
#Remove duplicates and sort
all_words = sorted(set(all_words)) 
tags = sorted(set(tags))

print(f"{len(xy)} Patterns\n\n{len(tags)} Tags: {tags}")

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
Example #15
    data = [preprocess(text) for text in data]
    with open('processed.pkl', 'wb') as f:
        pickle.dump((data, y), f)


keywords = {
    0: ['love', 'people', 'time', 'day', 'life'],
    1: ['free', 'video', 'join', 'check', 'win'],
    2: ['f****d', 'ass', 'bitch', 'bad', 'shit'],
    3: ['hate', 'n***a', 'idiot', 'ass', 'trump']
}



for class_label, words in keywords.items():
    keywords[class_label] = [stem(w) for w in words]

# get count features
count_vectorizer = CountVectorizer(input='content', encoding='ascii',
                                   decode_error='ignore',
                                   strip_accents='ascii',
                                   stop_words='english', min_df=2)
count_weights = count_vectorizer.fit_transform(data)
vocabulary = count_vectorizer.vocabulary_

# get tf idf features
vectorizer = TfidfVectorizer(input='content', encoding='ascii',
                             decode_error='ignore', strip_accents='ascii',
                             stop_words='english', min_df=2,
                             vocabulary=vocabulary)
tfidf_weights = vectorizer.fit_transform(data)
    print("Please wait while we are stemming the text ... \n")

    # Processing of the input text before neighbors finding
    prog = re.compile(r"[_\-\(]*([A-Z]\.)*[_\-\(]*")  # raw string avoids invalid-escape warnings

    for t in toProcess:
        terms = toProcess[t].split()  #stem(algo,toProcess[t]).split()
        text = ""
        for w in terms:
            if (prog.match(w)):
                w = w.replace('.', '')
                text = text + " " + w
        text = ' '.join(escape(text).split())
        text = " ".join(nltk.word_tokenize(text))
        d = []
        text = " ".join([stem(algo, w) for w in text.split()])
        d += text.split()
        toProcess[t] = d

    #print(toProcess)

    if bool(args["--stop"]):
        stopWordsList = set(stopwords.words('english'))
        print(stopWordsList)
        for i in toProcess:
            listWords = [w for w in toProcess[i] if w not in stopWordsList]
            toProcess[i] = listWords
        #print(toProcess)

    #print(toProcess)
Example #17
def get_stories(id, type):
    ui = user_interests.UserInterests()
    ui.interests = []
    interests = ui.generate_interests_keywords(id)

    ALGOLIA_URL = 'https://hn.algolia.com/api/v1/'
    if type == 'top':
        ALGOLIA_URL += 'search?tags=front_page&hitsPerPage=50'
    elif type == 'new':
        ALGOLIA_URL += 'search_by_date?tags=story&hitsPerPage=50'
    elif type == 'show':
        ALGOLIA_URL += 'search_by_date?tags=show_hn&hitsPerPage=50'
    elif type == 'ask':
        ALGOLIA_URL += 'search_by_date?tags=ask_hn&hitsPerPage=50'

    req = Request(ALGOLIA_URL, headers={'User-Agent': 'Mozilla/5.0'})
    data = json.loads(urlopen(req).read().decode('utf8'))

    stories = []

    # Shuffle interests
    random.shuffle(interests)

    for i in range(0, len(data['hits'])):
        title = data['hits'][i]['title']
        url = data['hits'][i]['url']
        time = data['hits'][i]['created_at']
        author = data['hits'][i]['author']
        points = data['hits'][i]['points']
        comments_count = data['hits'][i]['num_comments']

        if url == '':
            continue

        if type == 'ask':
            title = title.replace('Ask HN:', '')
        if type == 'show':
            title = title.replace('Show HN:', '')

        words = gensim.utils.simple_preprocess(str(title), deacc=True)

        story = {}
        flag = False
        for word in words:
            word = word.lower()
            for interest in interests:
                if word in interest[1] or utils.stem(word) in interest[1]:
                    story['title'] = title
                    story['url'] = url
                    story['time'] = time
                    story['author'] = author
                    story['points'] = points
                    story['comments'] = comments_count
                    story['topics'] = interest[0]
                    stories.append(story)
                    flag = True
                    break
            if flag:
                break

    return stories[1:]
Example #18
with open('intents.json', 'r') as file:
    intents = json.load(file)

all_words = []
tags = []  # all the categories
xy = []  # zip processed arrs with tag
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        processed = tokenize(pattern)
        all_words.extend(processed)
        xy.append((processed, tag))

ignored = [',', '.', '?', '!']
all_words = [stem(word) for word in all_words
             if word not in ignored]  # stem all the words
all_words = sorted(set(all_words))  # get rid of duplicates
tags = sorted(set(tags))  # get rid of duplicates

X_train = []
y_train = []
for (processed_sentence, tag) in xy:
    bag = bag_of_words(processed_sentence, all_words)
    X_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)

# parameters
batch_size = 8