def sentence_tags(sentence):
    """
        Return the morpheme (part-of-speech) tags of one sentence as a list
    """
    nlp = NLP()
    tokens = nlp.rm(nlp.tokens(sentence))
    tags = nlp.tags(tokens)
    tags = [elem[1] for elem in tags]
    return tags
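A minimal usage sketch for sentence_tags() above; the Japanese sample sentence and the availability of the NLP helper are assumptions, and the printed tags depend entirely on the underlying tagger:

# Hypothetical call; assumes the NLP class used by sentence_tags() is importable
# and that nlp.tags() yields (token, tag) pairs, as the snippet above implies.
tags = sentence_tags("今日は良い天気です")
print(tags)  # a list of part-of-speech labels, one per remaining morpheme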
Example #2
    def __init__(self, configFile, db=None):

        self.configFile = configFile
        self.db = db

        self.qwd = QWD(configFile)

        UrlUtils.init(configFile)
        Validation.init(configFile)

        NLP.init(configFile)
Example #3
 def __init__(self):
     rospy.init_node('speech_node', anonymous=True)
     self.pub_command = rospy.Publisher('ui_command', UI, queue_size=10)
     self.pub_feedback = rospy.Publisher('ui_feedback', UI_feedback, queue_size=10)
     self.pub_floor = rospy.Publisher('lift', lift, queue_size=10)
     self.r = sr.Recognizer()
     self.mic = sr.Microphone(device_index = 2, sample_rate = 44100, chunk_size = 512)
     self.r.dynamic_energy_threshold = True
     with self.mic as source:
         self.r.adjust_for_ambient_noise(source)
     self.nlp = NLP()
     print('Speech node ready')
Example #4
    def get_json(self, lang, sentences):
        '''
            Explanation: Creates an object of class NLP to get the list of sentences in the appropriate tagged JSON format

            Parameters:
                lang: Language of the list of sentences
                sentences: List of strings representing the sentences
            
            Return:
                JSON object containing annotated values
        '''
        nlp = NLP(lang, sentences)
        return nlp.get_dependencies()
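A hedged usage sketch; SentenceAnnotator is a hypothetical stand-in for whatever class defines get_json(), and the NLP(lang, sentences).get_dependencies() behaviour is assumed from the snippet rather than taken from a documented API:

# Hypothetical usage of get_json() defined above.
annotator = SentenceAnnotator()   # stand-in host class (assumption)
doc = annotator.get_json("en", ["The cat sat on the mat.", "Dogs bark loudly."])
print(doc)                        # dependency-annotated JSON for the sentences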
Example #5
 def get_corpus_from_node(self, node):
     '''
     Read every .txt file under the given category graph node's directory
     and return a list of cleaned text contents (UTF-8 encoded)
     '''
     texts = []
     for myfile in os.listdir(self.categories[node]["path"]):
         if myfile.endswith(".txt"):
             path = os.path.join(self.categories[node]["path"], myfile)
             with codecs.open(path, "r", "utf-8") as text_file:
                 text = text_file.read().replace('\n', '')
                 result = NLP(text).get_clean_text()
                 texts.append(result.encode("utf-8"))
     return texts
Example #6
 def __init__(self, lang='en', **kwargs):
     """
     Do not use this object for anything else besides training models because normalization of documents
     takes a while.
     TODO: in an actual solution documents should be normalized and those normalized documents should be stored
     TODO: as corpus for fast loading when initiating a training procedure.
     :param lang: str
     :param kwargs: dict
     """
     self.datasets = kwargs.get('datasets', DATASETS)
     self.nlp = NLP(lang)
     self.tagged_docs = []
     self.__init_tagged_docs(
     )  # The dataset is small enough to be loaded in RAM
Example #7
    def __init__(self):
        WXBot.__init__(self)

        self.tuling_key = ""
        self.robot_switch = True

        self.nlp = NLP()
        self.reply_list = ['温', '司尚春', '亲爱的']
        try:
            cf = ConfigParser.ConfigParser()
            cf.read('conf.ini')
            self.tuling_key = cf.get('main', 'key')
        except Exception:
            pass
        print 'tuling_key:', self.tuling_key
Example #8
    def classifier(features, labels):
        features_copy = features.copy()
        labels_copy = labels.copy()

        features_copy = NLP.text_to_numeric(features_copy)

        results = NLP.test(features_copy, labels)
        betterFM = 0

        for classifier in results:
            metrics = results[classifier]
            if (metrics['F1']['avg'] > betterFM):
                betterFM = metrics['F1']['avg']

        return betterFM
Example #9
def call_find_template_set(arg_json, arg_nlp, arg_templates):
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()]
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i])
                     for i in indices]
    templates = [wp.extract_template() for wp in word_problems]

    unique = list()
    wp_template_map = dict()
    for wp in word_problems:
        template = wp.template
        wp_index = wp.labeled_example.index
        found_template = False
        for unique_i, u in enumerate(unique):
            if template == u:
                wp_template_map[wp_index] = unique_i
                found_template = True
                break

        if not found_template:
            unique.append(template)
            wp_template_map[wp_index] = len(unique) - 1

    print('{} total and {} unique templates'.format(len(templates),
                                                    len(unique)))
    with open(arg_templates, 'wt') as f_handle:
        out_json = {'templates': [t.to_json() for t in unique],
                    'wp_template_map': wp_template_map}
        f_handle.write(json.dumps(out_json))
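The de-duplication loop above can be illustrated with a self-contained sketch that uses plain strings as stand-ins for Template objects and list positions as stand-ins for word-problem indices:

# Illustrative only: strings play the role of Template objects.
templates = ['x + y = z', 'x * y = z', 'x + y = z']
unique = []
wp_template_map = {}
for wp_index, template in enumerate(templates):
    if template in unique:
        wp_template_map[wp_index] = unique.index(template)
    else:
        unique.append(template)
        wp_template_map[wp_index] = len(unique) - 1
print('{} total and {} unique templates'.format(len(templates), len(unique)))
print(wp_template_map)  # {0: 0, 1: 1, 2: 0}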
Example #10
def call_extract_features(arg_json, arg_nlp, arg_templates, arg_parameters):
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()]
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i])
                     for i in indices]

    with open(arg_templates, 'rt') as f_handle:
        raw = f_handle.read()

    parsed = json.loads(raw)
    unique_templates = [Template.from_json(j) for j in parsed['templates']]
    # TODO(Eric): using only 2 word problems for testing
    unique_templates = unique_templates[:2]
    word_problems = word_problems[:2]

    feature_extractor = FeatureExtractor(unique_templates, word_problems)
    derivations = initialize_partial_derivations_for_all_templates(
        word_problems[0], unique_templates)
    derivation = derivations[0]
    while not derivation.is_complete():
        derivation = derivation.all_ways_to_fill_next_slot()[0]

    print(feature_extractor.extract(derivation))
    print(derivation)
Example #11
class Datasets:
    def __init__(self, lang='en', **kwargs):
        """
        Do not use this object for anything else besides training models because normalization of documents
        takes a while.
        TODO: in an actual solution documents should be normalized and those normalized documents should be stored
        TODO: as corpus for fast loading when initiating a training procedure.
        :param lang: str
        :param kwargs: dict
        """
        self.datasets = kwargs.get('datasets', DATASETS)
        self.nlp = NLP(lang)
        self.tagged_docs = []
        self.__init_tagged_docs(
        )  # The dataset is small enough to be loaded in RAM

    def __init_tagged_docs(self):
        for dataset_name in self.datasets:
            dataset = initialize_dataset(dataset_name)
            for doc, labels in dataset.get_documents_labels():
                self.tagged_docs.append(
                    TaggedDocument(words=self.nlp.normalize_text(doc),
                                   tags=labels))

    def __iter__(self):
        for tagged_document in self.tagged_docs:
            yield tagged_document
Example #12
class Doc2VecEmbedding:
    def __init__(self, lang='en'):
        self.lang = lang
        self.model = None
        self.nlp = NLP(self.lang)

    def __get_model_fpath(self):
        return join(MODELS_DIR, 'doc2vec_%s.model' % self.lang)

    def load(self):
        self.model = Doc2Vec.load(self.__get_model_fpath())

    def fit(self, **kwargs):
        logging.basicConfig(format='%(levelname)s : %(message)s',
                            level=logging.INFO)
        logging.root.level = logging.INFO
        datasets = Datasets()
        params = DOC2VEC_PARAMS
        self.model = Doc2Vec(datasets.tagged_docs, **params)
        self.model.save(self.__get_model_fpath())

    def vectorize(self, text):
        tokens = self.nlp.normalize_text(text)
        return self.model.infer_vector(tokens, steps=256)

    def find_most_similar_docs(self, text, topn=10):
        vector = self.vectorize(text)
        return self.model.docvecs.most_similar(positive=[vector], topn=topn)
Example #13
def main(args):
    # Load configuration
    config = Configuration(args.yaml_path)

    print("Loading Probase...")
    probase = Probase(config)

    print("Loading dataset...")
    dataset = Data(config)

    print("Loading NLP utility...")
    nlp = NLP('en')

    print("Loading feature extractor...")
    features = Feature(config, probase, nlp=nlp)

    print("Extracting vector features")
    features.extract_vector_features(dataset)

    print("Extracting statistical vector features")
    features.extract_statistical_features(dataset)

    print("Evaluating clasifiers")
    ev = Evaluation(config, dataset)
    ev.full_evaluation(features.X, features.y)
Example #14
File: robot.py Project: gcd0318/robot
 def __init__(self, name):
     self.name = name
     self.ai = GCAI(qad={
         '你是谁': '我是' + self.name,
         self.name: '干什么',
     })
     self.voice = Voice(VOICE_REGS)
     self.nlp = NLP(NLP_REGS)
Example #15
 def __init__(self, provider, output) -> None:
     super().__init__()
     self.provider = provider
     self.output = output
     self.nlp = NLP(NameProvider("gn_csv/mena.csv"),
                    TownProvider("gn_csv/GNKU.csv"),
                    GeoProvider("gn_csv/geograficky_nazov.csv"))
     self.cache = Cache(100000, output)
Example #16
def server(config):
    print('Compiling assets...')
    compile_assets()

    print('Loading NLP models...')
    app.nlp = NLP(config)

    return app
Example #17
File: speech.py Project: caoanle13/Rise
def on_message(client, userdata, message):

    message_payload = json.loads(message.payload.decode())
    message_type = message_payload['type']

    if message_type == SPEECH_TRIGGER:

        time.sleep(2)
        # speech recognition
        speech_success, text = recognize()
        if speech_success:
            print("output of speech recognition:", text)
            # natural language processing
            nlp = NLP()
            nlp_success, meaning = nlp.parse(text)
            if nlp_success:
                print("output of natural language processing:", meaning)
                if meaning == 'sunrise':
                    pi_data = json.dumps({'type': TIME_SET, 'nature': SUNRISE})
                    app_data = json.dumps({
                        'type': SPEAK,
                        'say': speech_messages['WAKEUP_SUNRISE']
                    })
                else:
                    pi_data = json.dumps({
                        'type': TIME_SET,
                        'nature': AT,
                        'time': meaning
                    })
                    hour = str(int(meaning.split(' ')[1].split(':')[0]))
                    if hour == '0':
                        hour = 'midnight'
                    minute = str(int(meaning.split(' ')[1].split(':')[1]))
                    if minute == '0':
                        minute = ''
                    app_data = json.dumps({
                        'type': SPEAK,
                        'say': speech_messages['WAKEUP_TIME'] + str(hour) + ' ' + str(minute)
                    })
                client.publish(TO_PI, pi_data)
                client.publish(TO_APP, app_data)
Example #18
 def analyze(self, lyrics_result, stats_result, analyze_threshold):
     if not self.data['lyrics']:
         self.data['lyrics'] = self.parse_lyrics(lyrics_result)
     self.data['stats'] = self.parse_stats(stats_result)
     artists_to_analyze = self.divide_artists_to_analyze()
     nlp = NLP(artists=artists_to_analyze,
               analyze_threshold=analyze_threshold)
     try:
         results = nlp.start()
     except Exception as e:
         print(
             'Error: To be able to analyze lyrics, start Zemberek app with `StartGrpcServer` parameter!\n\n'
         )
     else:
         processed_stats = nlp.process_stats(results)
         for stat in processed_stats:
             self.data['stats'].append(stat)
         self.save('stats', stats_result)
         print('Analyzes saved!')
Example #19
def call_count_unique(arg_json, arg_unique, arg_nlp):
    examples = LabeledExample.read(arg_json)
    templates = list()
    for index in arg_unique:
        example = examples[index]
        natural_language = NLP.read(arg_nlp, index)
        wp = WordProblem(example, natural_language)
        templates.append(wp.extract_template())

    print(len(set(templates)))
    print(json.dumps([t.to_json() for t in templates]))
Example #20
File: Main.py Project: dchar18/TORi
    def __init__(self, *args, **kwargs):
        self.nlp_helper = NLP()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        self.label_text = ""

        Page.__init__(self, *args, **kwargs)
        frame = tk.Frame(self, bg="gray")
        frame.pack(fill="x", expand=False)
        self.label = tk.Label(frame, text=self.label_text, font=LARGE_FONT)
        button = tk.Button(frame,
                           text="Main View",
                           fg="white",
                           bg="gray",
                           width=15,
                           height=7,
                           command=self.button_action)
        self.label.place(in_=frame, x=0, y=0, relwidth=1, relheight=1)
        self.label.pack(side="top", fill="both", expand=True)
        button.pack(side="bottom")
Example #21
def word_count_similarity(path, speech_type, speech_id):
    """
        Compute the similarity between adjacent sentences based on their word counts.
        non-similarity = (word count of sentence t) / (word count of sentence t-1)
        Output the non-similarity values as a wave, like a sine curve.
        The larger the non-similarity, the lower the similarity.
    """
    file_path = f"{path}/{speech_type}/{speech_id}.txt"
    sentences = preprocess(file_path)
    nlp = NLP()
    tokens_list = [nlp.rm(nlp.tokens(s)) for s in sentences]
    word_counts = [len(token) for token in tokens_list]
    if 0 in word_counts:
        word_counts.remove(0)
    similarities = [0.0]
    for idx in range(len(word_counts) - 1):
        if word_counts[idx] != 0:
            sim = round(word_counts[idx + 1] / word_counts[idx], 2)
            similarities.append(sim)

    return similarities
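A worked sketch of the ratio described in the docstring, with invented word counts so it runs without preprocess() or the NLP helper:

# Illustrative only: the per-sentence word counts are made up.
word_counts = [12, 6, 9]
similarities = [0.0]
for idx in range(len(word_counts) - 1):
    if word_counts[idx] != 0:
        # non-similarity = word count of sentence t / word count of sentence t-1
        similarities.append(round(word_counts[idx + 1] / word_counts[idx], 2))
print(similarities)  # [0.0, 0.5, 1.5]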
Example #22
class TestNLP(TestCase):
    def setUp(self):
        self.NLP = NLP()

    def test_translate_whitespace(self):
        original = '   look    hobo    '
        test = 'look hobo'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_eat_mushroom(self):
        original = 'eat mushroom'
        translated = self.NLP.translate(original)
        self.assertEqual(original, translated)

    def test_translate_whitespace_2(self):
        original = 'look  hobo'
        test = 'look hobo'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_translate_verbs(self):
        original = '   view    hobo    '
        test = 'look hobo'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_translate_verbs_2(self):
        original = 'view dog'
        test = 'look dog'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_translate_2(self):
        original = 'eat hobo'
        test = 'eat hobo'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_remove_preps(self):
        original = '   view the about    hobo    '
        test = 'look hobo'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)

    def test_remove_articles(self):
        original = '   view a albatross    '
        test = 'look albatross'
        translated = self.NLP.translate(original)
        self.assertEqual(test, translated)
Example #23
    def get_dict(features, labels, percentage_words=0.1, iterations=50):
        real_features = features.copy()

        words_label = USES_MULTI.words_label(features, labels)

        number_words = len(words_label)
        number_selected_words = int(round(number_words * percentage_words))

        distinct_labels = list(set(labels))
        number_selected_words_label = int(number_selected_words /
                                          len(distinct_labels))

        features_score = USES_MULTI.feature_score(words_label, labels)

        candidates = {}

        for label in features_score:
            positive_candidates = features_score[
                label][:number_selected_words_label]
            candidates.update(positive_candidates)
            negative_candidates = features_score[label][
                -number_selected_words_label:]
            candidates.update(negative_candidates)

        results = {}
        actual_iteration = 0

        while (actual_iteration < iterations):
            actual_candidates = USES_MULTI.random_items(
                candidates, number_selected_words)
            filtered_features = NLP.filter_features(features,
                                                    actual_candidates)
            try:
                #pprint(actual_iteration)
                FMeasure = USES_MULTI.classifier(filtered_features, labels)
                #pprint(FMeasure)
            except Exception as e:
                #pprint(e)
                FMeasure = 0

            results.update({FMeasure: actual_candidates})

            actual_iteration = actual_iteration + 1

        results = sorted(results.items(), key=lambda x: x[0], reverse=True)

        best_FM = results[0][0]
        dict_words = results[0][1]

        #print('Best: ', best_FM)

        return dict_words
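A self-contained sketch of the same random-subset search idea, with a toy scoring function standing in for USES_MULTI.classifier(); the candidate words and the "useful" set are invented:

import random

candidates = ['price', 'cheap', 'love', 'hate', 'fast', 'slow']

def fake_f_measure(words):  # stand-in for the real classifier's best F1
    return len(set(words) & {'price', 'love'}) / 2.0

best_score, best_words = -1.0, []
for _ in range(50):
    sample = random.sample(candidates, 3)
    score = fake_f_measure(sample)
    if score > best_score:
        best_score, best_words = score, sample

print(best_score, best_words)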
Example #24
def call_print(arg_json, arg_index, arg_nlp):
    examples = LabeledExample.read(arg_json)
    example = examples[arg_index]
    natural_language = NLP.read(arg_nlp, arg_index)
    wp = WordProblem(example, natural_language)
    wp.extract_template()
    print(wp)
    print('questions: {}'
          .format([(s.as_text(), s.object_of_sentence())
                   for s in wp.nlp.questions().itervalues()]))
    print('commands: {}'
          .format([(s.as_text(), s.object_of_sentence())
                   for s in wp.nlp.commands().itervalues()]))
Example #25
File: Main.py Project: dchar18/TORi
class MainView(Page):
    label_text = ""

    def __init__(self, *args, **kwargs):
        self.nlp_helper = NLP()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        self.label_text = ""

        Page.__init__(self, *args, **kwargs)
        frame = tk.Frame(self, bg="gray")
        frame.pack(fill="x", expand=False)
        self.label = tk.Label(frame, text=self.label_text, font=LARGE_FONT)
        button = tk.Button(frame,
                           text="Main View",
                           fg="white",
                           bg="gray",
                           width=15,
                           height=7,
                           command=self.button_action)
        self.label.place(in_=frame, x=0, y=0, relwidth=1, relheight=1)
        self.label.pack(side="top", fill="both", expand=True)
        button.pack(side="bottom")

    def button_action(self):
        self.label_text = "Listening..."
        self.label.config(text=self.label_text)

        input = self.nlp_helper.listen(self.recognizer, self.microphone)
        if input != "Nothing transcribed":
            result = self.nlp_helper.predict_class(input)
            self.label_text = result
            self.label.config(text=self.label_text)
        else:
            self.label_text = input
            self.label.config(text=self.label_text)
Example #26
 def run(self, problem):
     nlp = NLP(fg=problem.fg,
               x0=problem.var_init,
               h=problem.h,
               lb=problem.var_bounds_l,
               ub=problem.var_bounds_u)
     solver = Solver(ftol=self.ftol,
                     gtol=self.gtol,
                     maxiter=self.maxiter,
                     verbose=False,
                     record=False)
     res = solver.solve(nlp)
     res.H = None  # Memory issues ;)
     res.g = None
     return res
Example #27
        def getKey(content):

            tokens = NLP.getMorphology(content)

            if tokens is None:
                return content

            for token in tokens:

                if token is None:
                    break

                if u'名词' == token['wtype']:
                    return token['word']

            return content
Example #28
def call_fold(arg_testfold, arg_numfolds, arg_foldoutput,
              arg_json, arg_nlp, arg_templates, arg_parameters):
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()][:5]  # TODO just 5 for testing
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i])
                     for i in indices]

    fold_indices = make_fold_indices(arg_numfolds, len(word_problems))
    test_indices = fold_indices.pop(arg_testfold)
    train_indices = list()
    for per_fold in fold_indices:
        train_indices.extend(per_fold)

    with open(arg_templates, 'rt') as f_handle:
        raw = f_handle.read()

    parsed = json.loads(raw)
    unique_templates = [Template.from_json(j) for j in parsed['templates']]
    wp_template_map = {int(k): v
                       for k, v in parsed['wp_template_map'].iteritems()}

    train_wps = [word_problems[i] for i in train_indices]
    train_templates_indices = list({wp_template_map[wp.labeled_example.index]
                                    for wp in train_wps})
    remap_templates = {wp.labeled_example.index:
                       train_templates_indices.index(
                           wp_template_map[wp.labeled_example.index])
                       for wp in train_wps}
    train_templates = [unique_templates[i] for i in train_templates_indices]

    feature_extractor = FeatureExtractor(train_templates, train_wps)
    classifier = optimize_parameters(feature_extractor, train_wps,
                                     train_templates, remap_templates)
    with open(arg_parameters, 'wt') as f_handle:
        f_handle.write(json.dumps(classifier.to_json()))

    correct = 0
    for test_i in test_indices:
        test_wp = word_problems[test_i]
        correct += classifier.solve(test_wp)
    print('{} correct out of {}'.format(correct, len(test_indices)))
Example #29
        def isMatched(title, key):

            keywords = NLP.getKeywords(title)

            if keywords is None:
                # XXX: Should it return true?
                return True

            for keyword in keywords:

                if keyword is None:
                    # XXX: Should it return true?
                    return True

                if key in keyword['keyword']:
                    return True

            print '"', title, '" doesn\'t match "', key, '"'

            return False
Example #30
def main():
    last_update_id = None
    if not os.path.isfile("proc_data/proc_data.pkl"):
        preproc()
    if not os.path.isfile("model/encoder.h5"):
        preproc()
    df = pd.read_pickle("proc_data/proc_data.pkl")
    encoder = load_model("model/encoder.h5")
    with open("word_vectors/word_indx.csv", 'rb') as outfile:
        word_index = pickle.load(outfile)
    with open("word_vectors/embedding_matrix.csv", 'rb') as outfile:
        embedding_matrix = pickle.load(outfile)
    nlp = NLP(df, encoder, word_index, embedding_matrix, 200, 200)
    print("The bot is now active")
    while True:
        updates = get_updates(last_update_id)
        if len(updates["result"]) > 0:
            last_update_id = get_last_update_id(updates) + 1
            handle_updates(updates, nlp)
        time.sleep(0.5)
Example #31
    def get_dict(features, labels, max_words=5):
        real_features = features.copy()
        real_labels = labels.copy()

        words_label = USES_MULTI.words_label(features, labels)
        number_words = len(words_label)

        if (number_words < max_words):
            max_words = number_words

        features_score = ALTER_USES.feature_score(words_label, labels)

        results = {}
        number_words = 1

        while (number_words <= max_words):
            dict_words = ALTER_USES.build_dict(features, labels,
                                               features_score, number_words)

            filtered_features = NLP.filter_features(features, dict_words)

            try:
                #pprint(number_words)
                FMeasure = USES_MULTI.classifier(filtered_features, labels)
                #pprint(FMeasure)
            except Exception as e:
                #pprint(e)
                FMeasure = 0

            results.update({FMeasure: dict_words})

            number_words = number_words + 1

        results = sorted(results.items(), key=lambda x: x[0], reverse=True)

        best_FM = results[0][0]
        dict_words = results[0][1]

        #print('Best FM: ', best_FM)

        return dict_words
Example #32
	def execute(self, string):
		#print(queries.index(string))
		a = NLP(string)
		a.namedEntityRecognition()
		a.replaceContractions()
		a.lemmatize()

		print("Lemmatized query:", a.lowercase_query)
		a.tokenize()
		a.removePunctAndStop()
		a.replaceRelations()
		a.replaceAttr()
		a.reconstruct()
		a.replaceOperators()
		a.replaceSynAttr()
		a.replaceSynCommon()
		#print (a.lowercase_query)
		a.andOr()
		a.unknownAttr()
		a.relationSearch()
		a.negationCheck()
		a.removeDuplicates()
		a.cleaningSelectList()
		#print(a.SELECT)
		#print(a.WHERE)
		#print("Unique relation ",a.unique_attribute_relation)
		#print("Common relation ",a.common_attribute_relation)
		b = QueryConstruction(a.SELECT, a.WHERE, a.unique_attribute_relation, a.common_attribute_relation)

		b.checkJoin()
		b.constructSelectPart()
		check = b.constructFromPart()
		if check is True:
			b.constructWherePart()

		#print(b.final_query)
		return b.final_query
Example #33
import gensim
import pandas as pd
from nlp import NLP
from dataset_utils import *
import jieba

combined_dir = '../dataset/combined/'

assembled_combined_csv = pd.read_csv(combined_dir + 'assembled_combined.csv', quoting=3)
train_dataset, test_dataset = NLP.divide_train_test(assembled_combined_csv, '2016-02-01')

# date and label list




class NLPWithGensim:
    @classmethod
    def documents_to_tfidf_corpus(cls, documents):
        news_data = documents.values
        tokenized_news_list = []
        for daily_news in news_data:
            filtered_daily_news = []
            filtered_daily_news = [x for x in daily_news if str(type(x)).find('float') == -1]

            filtered_daily_news = ' '.join(filtered_daily_news)
            filtered_daily_news = list(jieba.cut(filtered_daily_news))
            tokenized_news_list.append(filtered_daily_news)
        dic = gensim.corpora.Dictionary(tokenized_news_list)
        corpus = [dic.doc2bow(text) for text in tokenized_news_list]
        tfidf = gensim.models.TfidfModel(corpus)
Example #34
class TweetAnalyser(BaseAnalyser):

    def __init__(self):
        self.nlp = NLP()
        self.txt_utils = TextUtils()
        self.usr_words = {}     # to temporarily store users with tagged words
        self.users = {}
        self.result = None

    def add_users(self, users):
        for u in users:
            # store tagged tweets of the current user
            if u.tweets:
                self.users[u.id] = u.screen_name
                self.usr_words[u.id] = self.get_tagged_words(u.tweets, ['N'])

    def get_tagged_words(self, tweets, tags):
        """ Return a list of all the tagged words of tweets
            tweets:     list of TwitterMessage objects
                        [TwitterMessage, TwitterMessage]
            tags :      list with tag that must be filtered on
                        ['N', 'V']
        """
        words = []
        for tweet in tweets:
            tagged_tweet = self.tag_tweet(tweet)
            if tagged_tweet:
                for tagged_word in tagged_tweet:
                    if tagged_word[1] in tags:
                        words.append(tagged_word)
        return words

    def tag_tweet(self, tweet):
        if self.nlp.detect_language(tweet.text):
            return self.nlp.tag(self.txt_utils.tokenize(tweet.text), self.nlp.detect_language(tweet.text))
        return None

    def analyse(self):
        word_count = {}
        word_users = {} # contains a list of tuples (usr, timesused)

        data = {}

        # Build word_count and word_users
        data['message'] = "Building word_count and word_users"
        self.notifyAll(data)
        for u_id in self.usr_words:
            word_fd = nltk.FreqDist(word for (word, tag) in self.usr_words[u_id])
            for word in word_fd:
                if word_count.has_key(word):
                    word_count[word] += word_fd[word]
                    word_users[word].append((u_id, word_fd[word]))
                else:
                    word_count[word] = word_fd[word]
                    word_users[word] = [(u_id, word_fd[word])]

        data['message'] = "Filtering word_count and word_users on the times a word is used by a user in comparison with the other users of this word."
        self.notifyAll(data)
        # Filter the users of a word on the times a word is used
        for word in word_count:
            avg_usg = word_count[word]/float(len(word_users[word]))
            lower_limit = avg_usg - 0.25 * avg_usg
            for i, user in enumerate(word_users[word]):
                if user[1] < lower_limit:
                    word_count[word] -= user[1]


        clusters = []
        words = word_count.keys()
        data['message'] = "Comparing word users and clusters words and users if the group of users of both word match enough with each other."
        self.notifyAll(data)
        # Compare word_users and if they are similar combine to a clusters
        for i in range(len(words)):
            cluster_words = [words[i]]
            cluster_users = []
            users_a = [user[0] for user in word_users[words[i]]]
            cluster_users.extend(users_a)

            # Now compare the users of word[i] with the users of all other words
            for j in range((i+1), len(words)):
                users_b = [user[0] for user in word_users[words[j]]]
                intersect_len = len(set(users_a).intersection(set(users_b)))

                # Check if user groups of two words are very similar
                if intersect_len < 0.75 * len(users_a) or intersect_len < 0.75 * len(users_b):
                    continue
                # They are very similar
                cluster_words.append(words[j])
                cluster_users = set(cluster_users).union(users_b)

            # We don't want clusters with one word, and especially not clusters with one user.
            if len(cluster_users) == 1 or len(cluster_words) == 1:
                continue

            # Check if the cluster is not a subcluster (of words) of a previous cluster. Then we can skip the subcluster
            in_previous_cluster = False
            for cluster in clusters:
                if len(set(cluster_words)) == len(set(cluster[0]).intersection(set(cluster_words))):
                    in_previous_cluster = True
                    break
            if in_previous_cluster:
                continue

            # Everything ok, so replace user ids with screennames in result
            screennames = []
            for user in cluster_users:
                screennames.append("@" + self.users[user])

            # Save found cluster
            data['message'] = self.cluster_to_string((cluster_words, screennames))
            self.notifyAll(data)
            clusters.append((cluster_words, screennames))

        # Sort by users per cluster
        sorted_clusters = sorted(clusters, cmp=lambda x,y: cmp(len(x[1]), len(y[1])), reverse=True)
        self.result = sorted_clusters
        return sorted_clusters

    def cluster_to_string(self, cluster):
        r = ""
        for word in cluster[0]:
            r+= " " + word
        r+="\n"

        for username in cluster[1]:
            r += " " + str(username)
        r+="\n"
        return r

    def result_to_string(self):
        """Returns a printable version of the results"""
        r = ""
        for cluster in self.result:
            r += self.cluster_to_string(cluster) + "\n"
        return r
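The 75% overlap test inside analyse() can be shown with a small self-contained sketch; the user ids are invented and only the set-intersection criterion is illustrated:

# Illustrative only: two words are clustered when their user groups overlap
# by at least 75% of each group, mirroring the check in analyse() above.
users_a = {1, 2, 3, 4}
users_b = {2, 3, 4, 5}
intersect_len = len(users_a & users_b)
similar = not (intersect_len < 0.75 * len(users_a) or
               intersect_len < 0.75 * len(users_b))
print(similar)  # True: 3 shared users covers 75% of both 4-user groups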
Example #35
 def __init__(self):
     self.nlp = NLP()
     self.txt_utils = TextUtils()
     self.usr_words = {}     # to temporarily store users with tagged words
     self.users = {}
     self.result = None