Example #1
def test_parse():
    parse, _ = liwc.load_token_parser(os.path.join(test_dir, "alpha.dic"))
    sentence = "Any alpha a bravo charlie Bravo boy"
    tokens = sentence.split()
    matches = [category for token in tokens for category in parse(token)]
    # matching is case-sensitive, so the only matches are "alpha" (A), "a" (A) and "bravo" (Bravo)
    assert matches == ["A", "A", "Bravo"]
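For reference, a minimal sketch of the kind of alpha.dic fixture this test assumes, using the standard LIWC .dic layout (a %-delimited header mapping category ids to names, followed by word/category rows). The actual fixture file is not shown here, so treat the contents below as illustrative only:

# Hypothetical fixture: two categories (A, Bravo) and three lowercase entries,
# consistent with the case-sensitive matches asserted above.
import os
import tempfile

import liwc

ALPHA_DIC = "%\n1\tA\n2\tBravo\n%\na\t1\nalpha\t1\nbravo\t2\n"

with tempfile.TemporaryDirectory() as tmp:
    dic_path = os.path.join(tmp, "alpha.dic")
    with open(dic_path, "w") as f:
        f.write(ALPHA_DIC)
    parse, category_names = liwc.load_token_parser(dic_path)
    assert category_names == ["A", "Bravo"]
    assert list(parse("alpha")) == ["A"]
    assert list(parse("Bravo")) == []  # case-sensitive: lowercase "bravo" only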
Example #2
    def process(self, dataset):
        print("Processing LIWC...")

        # LIWC loading and processing
        parse, category_names = liwc.load_token_parser(self.path)
        result = pd.DataFrame(self.count_emotions(dataset, parse))
        return result
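count_emotions is defined elsewhere in this project; a rough, hypothetical sketch of what such a helper could look like, assuming dataset is an iterable of raw text strings (an assumption, not something the example states):

import re
from collections import Counter

def count_emotions(dataset, parse):
    # Hypothetical helper: one dict of LIWC category counts per text,
    # suitable for feeding straight into pd.DataFrame as above.
    rows = []
    for text in dataset:
        tokens = re.findall(r"\w+", text.lower())
        rows.append(dict(Counter(category for token in tokens
                                 for category in parse(token))))
    return rows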
Example #3
 def test_parse(self):
     parse, _ = liwc.load_token_parser('tests/alpha.dic')
     sentence = 'Any alpha a bravo charlie Bravo boy'
     tokens = sentence.split()
     matches = [category for token in tokens for category in parse(token)]
     # matching is case-sensitive, so the only matches are "alpha" (A), "a" (A) and "bravo" (Bravo)
     self.assertEqual(matches, ['A', 'A', 'Bravo'])
def get_raw_liwc_categories_for_topics(model, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    categories = []
    for topic in model["topics"]:
        word_categories = {}
        for word in topic:
            word_categories[word] = [category for category in parse(word)]
        categories.append(word_categories)
    return categories
def process_lines(dump: Iterable[list], stats: Mapping, users_dict: dict,
                  stats_dict: dict, fieldnames: Iterable[list],
                  args: argparse.Namespace) -> (str, Iterable[list]):
    """Assign each revision to the snapshot or snapshots to which they
       belong.
    """

    first = next(dump)
    lang = first['lang']
    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])

        if args.filter_users == 'per-category':
            filter_fields = ['male_', 'female_', 'org_']
        elif args.filter_users == 'per-age':
            filter_fields = ['<40_', '>=40_']
        else:
            filter_fields = ['']

        for field in filter_fields:
            for category in category_names:
                fieldnames.append('{}{}'.format(field, category))
            fieldnames.append('{}total'.format(field))
            for category in category_names:
                fieldnames.append('{}{}_count'.format(field, category))

        for fieldname in fieldnames:
            stats_dict[fieldname] = 0

        valid_users = None
        if args.filter_users:
            valid_users = get_valid_users(args, lang)
            if not valid_users:
                utils.log('The file of valid users could not be found')
                return (None, None)

        process_tweet(first,
                      parse=parse,
                      category_names=category_names,
                      stats_dict=stats_dict,
                      users_dict=users_dict,
                      valid_users=valid_users,
                      stats=stats,
                      args=args)
        for raw_obj in dump:
            process_tweet(raw_obj,
                          parse=parse,
                          category_names=category_names,
                          stats_dict=stats_dict,
                          users_dict=users_dict,
                          valid_users=valid_users,
                          stats=stats,
                          args=args)
        return (lang, category_names)
    else:
        return (None, None)
def get_liwc_features(essay):
    parse, category_names = liwc.load_token_parser('ML/resources/LIWC2007_Portugues_win.dic.txt')
    counter = Counter({x: 0 for x in category_names})
    tokens = tokenize(essay)
    counter.update(category for token in tokens for category in parse(token))
    dic = {0: counter}
    liwc_df = pd.DataFrame.from_dict(dic, orient='index').fillna(0)
    sc = StandardScaler()
    liwc_features = sc.fit_transform(liwc_df)

    return liwc_features
Example #7
File: _utils.py  Project: USC-CSSL/NTAP
    def load(self, dic_path):
        lexicon, _ = liwc.read_dic(dic_path)

        cat2lexicon = defaultdict(list)

        for word, categories in lexicon.items():
            for c in categories:
                if word not in cat2lexicon[c]:
                    cat2lexicon[c].append(word)

        self.lexicon = cat2lexicon
        return liwc.load_token_parser(dic_path)
def get_liwc_categories_for_topics(model,
                                   liwc_dictionary_path,
                                   normalize=False):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    topic_words = get_all_topic_words(model)
    categories = [
        category for token in topic_words for category in parse(token)
    ]
    counts = Counter(categories)
    no_of_words = len(topic_words)
    if normalize is False:
        return sort_counts(counts)
    return sort_counts(
        {key: count / no_of_words for key, count in counts.items()})
def get_categories_for_text(text, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)

    def tokenize(text):
        # you may want to use a smarter tokenizer
        for match in re.finditer(r'\w+', text, re.UNICODE):
            yield match.group(0)

    tokens = tokenize(text.lower())

    categories = [category for token in tokens for category in parse(token)]
    counts = Counter(categories)

    return categories, counts
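The comment above notes that a smarter tokenizer may be preferable to the bare \w+ regex; one possible drop-in replacement, illustrative only and assuming NLTK with its punkt data is installed:

import nltk

def tokenize(text):
    # Hypothetical alternative tokenizer: handles punctuation and
    # contractions better than the simple \w+ regex.
    for token in nltk.word_tokenize(text):
        yield token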
Example #10
def liwc_parse(textstring):
    parse, cat_names = liwc.load_token_parser(LIWC_dictionary)

    tokens = tokenize(textstring)

    rawtext_counts = Counter(category for token in tokens
                             for category in parse(token))

    fpa = rawtext_counts['focuspast']
    fpr = rawtext_counts['focuspresent']
    ffu = rawtext_counts['focusfuture']
    fto = fpa + fpr + ffu
    if fto > 0:
        return fpa, fpr, ffu, fpa / fto, fpr / fto, ffu / fto
    else:
        return fpa, fpr, ffu, 0, 0, 0
def process_lines(dump: Iterable[list], stats: Mapping, users_dict: dict,
                  stats_dict: dict, fieldnames: Iterable[list],
                  args: argparse.Namespace) -> (str, Iterable[list]):
    """Assign each revision to the snapshot or snapshots to which they
       belong.
    """

    first = next(dump)
    lang = first['lang']
    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])

        for category in category_names:
            fieldnames.append(category)

        fieldnames.append('total')

        for category in category_names:
            fieldnames.append('{}_count'.format(category))

        valid_users = get_valid_users(args, lang)
        if not valid_users:
            utils.log('The file of valid users could not be found')
            return (None, None)

        process_tweet(first,
                      parse=parse,
                      category_names=category_names,
                      fieldnames=fieldnames,
                      stats_dict=stats_dict,
                      users_dict=users_dict,
                      valid_users=valid_users,
                      stats=stats,
                      args=args)
        for raw_obj in dump:
            process_tweet(raw_obj,
                          parse=parse,
                          category_names=category_names,
                          fieldnames=fieldnames,
                          stats_dict=stats_dict,
                          users_dict=users_dict,
                          valid_users=valid_users,
                          stats=stats,
                          args=args)
        return (lang, category_names)
    else:
        return (None, None)
def LIWC_features(df, colname, dictionary):
    """
    Adds LIWC features to a given dataframe
    Use 'LIWC2007_English100131.dic' file
    """
    parse, category_names = liwc.load_token_parser(dictionary)
    tknzr = TweetTokenizer()

    for i in range(len(category_names)):
        df[category_names[i]] = 0

    for i in range(df.shape[0]):
        t = tknzr.tokenize(df.loc[i, colname])
        features_counts = Counter(category for token in t
                                  for category in parse(token))
        for key, value in features_counts.items():
            df.loc[i, key] = value
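A possible usage of LIWC_features, assuming a dataframe with a text column and a local copy of the LIWC 2007 dictionary (both the sample data and the path are illustrative); note that the function mutates df in place rather than returning a new frame:

import pandas as pd

df = pd.DataFrame({"text": ["I am happy today", "we were so sad yesterday"]})
LIWC_features(df, "text", "LIWC2007_English100131.dic")  # adds one column per LIWC category
print(df.head())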
Example #13
def calculate_file_score(extracted_docs_fp, csv_path):
    """
    writes liwc scores of different files in a csv file
    :return:
    """
    print(extracted_docs_fp)
    parse, _ = liwc.load_token_parser('../data/LIWC2015Dictionary.dic')
    with open(csv_path, 'w') as liwc_csv_results:
        writer = csv.writer(liwc_csv_results)
        for row_number, file in enumerate(listdir(extracted_docs_fp)):
            row = [file]
            with open(path.join(extracted_docs_fp, file)) as document:
                doc_str = document.read()
                doc_str = preprocess_gov2(doc_str)
                female_bias, male_bias = calculate_doc_score(doc_str, parse)
                row.append(female_bias)
                row.append(male_bias)
                writer.writerow(row)
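calculate_doc_score is not shown in this example; judging from how its result is unpacked above, a plausible, purely hypothetical sketch would count hits in the LIWC "female" and "male" categories:

import re
from collections import Counter

def calculate_doc_score(doc_str, parse):
    # Hypothetical: return (female_bias, male_bias) as raw category counts.
    tokens = re.findall(r"\w+", doc_str.lower())
    counts = Counter(category for token in tokens for category in parse(token))
    return counts["female"], counts["male"]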
Example #14
def determineCategories(usr):
    parse = None
    totalWords = 0
    UsedWords = 0
    parse, category_names = liwc.load_token_parser(
        'src/main/LIWC/LIWC_Turkish.dic')
    gettysburg_counts = None
    tweets = usr['preprocessedTweets']
    #print(tweets)
    for tweet in tweets:
        #print(tweet)
        words = tweet.split(" ")
        for eachWord in words:  ## each word in a tweet
            totalWords = totalWords + 1
            optionalWords = eachWord.split("|")
            for eachOptionalWord in optionalWords:  # each option of the word; stop at the first option that belongs to any category
                category_tokens = tokenize(
                    eachOptionalWord)  # tokens of the option
                gettysburg_counts = Counter(category
                                            for token in category_tokens
                                            for category in parse(token))
                if len(gettysburg_counts) > 0:
                    for cat in gettysburg_counts.items():
                        addCategory(cat)
                    UsedWords += 1
                    break
    catList = list()
    sum = 0
    for cat in sorted(categoryCounts.keys()):
        #print(cat,":",str(categoryCounts[cat]))
        sum = sum + categoryCounts[cat]
    print("sum: " + str(sum))
    for cat in sorted(categoryCounts.keys()):
        normalized = categoryCounts[cat] / sum
        catList.append(cat + "," + str(("%.3f" % normalized)))

    updateDoc = {"username": sys.argv[1]}
    if usr is not None:
        liwcGroups = {"$set": {"groups": catList}}
        doc1 = col_User.update_one(updateDoc, liwcGroups)
        print("Updated User: "******"There is no user as " + sys.argv[1])
def process_lines(dump: Iterable[list], stats: Mapping, words_dict: dict,
                  args: argparse.Namespace) -> str:
    """Assign each revision to the snapshot or snapshots to which they
       belong.
    """
    first = next(dump)
    lang = first['lang']

    valid_users = None

    if args.users_file:
        utils.log('Specified a set of users to filter the tweet')
        valid_users = get_valid_users(args)
        if not valid_users:
            utils.log('The file of valid users could not be found\n')
            return None

    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])

        for category in category_names:
            words_dict[category] = {}

        words_dict['words'] = 0
        words_dict['tweets'] = 0

        process_tweet(first,
                      parse=parse,
                      stats=stats,
                      words_dict=words_dict,
                      valid_users=valid_users,
                      args=args)
        for raw_obj in dump:
            process_tweet(raw_obj,
                          parse=parse,
                          stats=stats,
                          words_dict=words_dict,
                          valid_users=valid_users,
                          args=args)
        return lang
    else:
        return None
Example #16
def compute_liwc_from_dict(df, col):
  parse, category_names = liwc.load_token_parser('./LIWC2015_English.dic') #path of LIWC dictionary

  frames=[]
  for text in df[col]:
    print(text)
    text_tokens = tokenize(text)
    print(text_tokens)
    text_counts = Counter(category for token in text_tokens for category in parse(token))
    print(text_counts)

    liwc_value_dic = {}
    for k,v in text_counts.items():
      liwc_value_dic['news_title'] = text
      word_count = len([word for word in text.split(' ')])
      liwc_value_dic['WC'] = word_count
      liwc_value_dic['WPS'] = sum([len(sent.split(' ')) for sent in sent_tokenize(text)])/len(sent_tokenize(text))
      liwc_value_dic[k.split(",")[0].split(' ')[0]] = (v/word_count)*100
    frames.append(pd.DataFrame([liwc_value_dic]))
    break
  df_liwc = pd.concat(frames)
  return df.merge(df_liwc, on=col)
Example #17
def process_lines(dump: Iterable[list], stats: Mapping, tweets_dict: dict,
                  args: argparse.Namespace) -> str:
    """Assign each revision to the snapshot or snapshots to which they
       belong.
    """
    first = next(dump)
    lang = first['lang']

    valid_users = None

    if args.users_file:
        utils.log('Specified a set of users to filter the tweet')
        valid_users = get_valid_users(args)
        if not valid_users:
            utils.log('The file of valid users could not be found\n')
            return None

    if args.lexicon == 'liwc' and lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])
    elif args.lexicon == 'emolex' and initEmotionLexicon(lang=lang):
        ...
    else:
        return None

    process_tweet(first,
                  parse=parse,
                  stats=stats,
                  tweets_dict=tweets_dict,
                  valid_users=valid_users,
                  args=args)
    for raw_obj in dump:
        process_tweet(raw_obj,
                      parse=parse,
                      stats=stats,
                      tweets_dict=tweets_dict,
                      valid_users=valid_users,
                      args=args)
    return lang
Example #18
def calculate_gendered_count(input_fp):
    """

    :param document:
    :return:
    """
    parse, _ = liwc.load_token_parser('../data/LIWC2015Dictionary.dic')
    query_df = pd.read_csv(input_fp, names=["qid", "terms", "weight"])
    # df_unbiased = pd.read_csv(UNBIASED_EXPANSION_FP, names=["qid", "terms", "weight"])
    #
    qids = list(pd.unique(query_df['qid']))
    total_fm_terms = 0
    total_male_terms = 0
    for query_id in qids:
        female_bias = 0
        male_bias = 0
        query_terms = query_df[query_df["qid"] == query_id]
        query = query_terms["terms"].tolist()
        query_str = " ".join(query)
        doc_tokens = tokenize(query_str)
        # liwc_counts = Counter(category for token in doc_tokens for category in parse(token))
        categories = []
        # token_counter = 0
        for token in doc_tokens:
            token = token.lower()
            # token_counter += 1
            for category in parse(token):
                categories.append(category)
        liwc_counts = Counter(categories)
        if "female" in liwc_counts.keys():
            female_bias = liwc_counts["female"]
        if "male" in liwc_counts.keys():
            male_bias = liwc_counts["male"]
        total_fm_terms += female_bias
        total_male_terms += male_bias
    return total_fm_terms, total_male_terms
Example #19
import numpy as np
import pandas as pd
import re
import json
from collections import Counter
from heapq import nlargest
import pickle
import plotly.express as px
import plotly.graph_objects as go
from PIL import Image
import streamlit as st
import liwc
import altair as alt
#import spacy
#nlp = spacy.load("en_core_web_sm")

parse, category_names = liwc.load_token_parser('data/queryDictionary.dic')
def tokenize(text):
    # you may want to use a smarter tokenizer
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)
def parseLIWC(x):
    gettysburg_tokens = tokenize(x)
    gettysburg_counts = Counter(category for token in gettysburg_tokens for category in parse(token))
    return gettysburg_counts




def load_data():
    DATA_URL = "data/kaggle_train.csv"
    data = pd.read_csv(DATA_URL)
Example #20
survey1["text"] = survey1.Q65

texts = survey1.text
texts = [text.lower() for text in texts]


# %%
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens


# def tokenize(text):
#     # you may want to use a smarter tokenizer
#     for match in re.finditer(r"\w+", text, re.UNICODE):
#         yield match.group(0)

parse, category_names = liwc.load_token_parser("/Users/kylie/LIWC.dic")

# %%
tokens = tokenize(test)

from collections import Counter

counts = Counter(category for token in tokens for category in parse(token))
print(counts["function"])
# => Counter({'funct': 58, 'pronoun': 18, 'cogmech': 17, ...})

# %%
# Compare what did you hear to what did you hope to hear
import itertools
from collections import Counter

import liwc
import numpy as np

from text_features.config import Config
"""
Contains functions to compute LIWC measures. This includes proportions of words in each category in the LIWC 2007
dictionary as well as linguistic process measures computed as part of the LIWC tool (e.g., word count).
"""

config = Config()
PARSE, CAT_NAMES = liwc.load_token_parser(config.LIWC_2007_PATH)


def extract_liwc_feats(segments):
    """
    Computes LIWC features for list text segments and stores in dictionary.
    :param segments: List of text segments, where each segment is a string. Segments are used to determine what
                     words are consecutive in order to identify bigrams + trigrams.
    :return: feats_dict: Dictionary mapping feature name to value for transcript
    """
    # compute feature values
    feats_dict = {}
    segments = [s.split(" ") for s in segments]
    words = list(itertools.chain.from_iterable(segments))
    # Generate lists of all bigrams and trigrams because some are in LIWC vocabulary (e.g. "is don't know", "you know")
    bigrams = []
    trigrams = []
    for segment in segments:
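The example is truncated at this point; a minimal sketch (not the project's actual code) of how the bigram and trigram lists described in the docstring could be built from the already-split segments:

def ngrams_from_segments(segments, n):
    # Hypothetical helper: join n consecutive words within each segment so
    # multiword LIWC entries (e.g. "you know") can be looked up as single tokens.
    grams = []
    for segment in segments:
        for i in range(len(segment) - n + 1):
            grams.append(" ".join(segment[i:i + n]))
    return grams

# bigrams = ngrams_from_segments(segments, 2)
# trigrams = ngrams_from_segments(segments, 3)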
Example #22
def psycho_naming(coords, node_size):
    """
    Perform Automated Sentiment Labeling of each coordinate from a list of MNI coordinates.

    Parameters
    ----------
    coords : list
        List of (x, y, z) tuples in voxel-space corresponding to a coordinate atlas used or
        which represent the center-of-mass of each parcellation node.
    node_size : int
        Spherical centroid node size in the case that coordinate-based centroids
        are used as ROI's for tracking.

    Returns
    -------
    labels : list
        List of string labels corresponding to each coordinate-corresponding psychological topic.

    References
    ----------
    .. [1] Tor D., W. (2011). NeuroSynth: a new platform for large-scale automated synthesis of
      human functional neuroimaging data. Frontiers in Neuroinformatics.
      https://doi.org/10.3389/conf.fninf.2011.08.00058
    .. [2] Tausczik, Y. R., & Pennebaker, J. W. (2010). The psychological meaning of words:
      LIWC and computerized text analysis methods. Journal of Language and Social Psychology.
      https://doi.org/10.1177/0261927X09351676

    """
    import liwc
    import pkg_resources
    import nimare
    import nltk
    import yaml
    import numpy as np
    from collections import Counter
    from nltk.corpus import sentiwordnet as swn
    from pynets.core.utils import flatten
    from nltk.stem import WordNetLemmatizer

    try:
        swn.senti_synsets('TEST')
    except:
        nltk.download('sentiwordnet')
        nltk.download('wordnet')

    with open(pkg_resources.resource_filename("pynets", "runconfig.yaml"),
              'r') as stream:
        hardcoded_params = yaml.load(stream)
        try:
            LIWC_file = hardcoded_params['sentiment_labeling']['liwc_file'][0]
        except FileNotFoundError:
            print('LIWC file not found. Check runconfig.yaml.')
        try:
            neurosynth_dset_file = hardcoded_params['sentiment_labeling'][
                'neurosynth_db'][0]
        except FileNotFoundError:
            print(
                'Neurosynth dataset .pkl file not found. Check runconfig.yaml.'
            )
    stream.close()

    try:
        dset = nimare.dataset.Dataset.load(neurosynth_dset_file)
    except FileNotFoundError:
        print('Loading neurosynth dictionary failed!')

    try:
        parse, category_names = liwc.load_token_parser(LIWC_file)
    except FileNotFoundError:
        print('Loading LIWC dictionary failed!')

    labels = []
    print('Building coordinate labels...')
    for coord in coords:
        print(coord)
        roi_ids = dset.get_studies_by_coordinate(
            np.array(coord).reshape(1, -1), node_size)
        labs = dset.get_labels(ids=roi_ids)
        labs_filt = list(
            flatten([
                list([
                    i for j in swn.senti_synsets(i)
                    if j.pos_score() > 0.75 or j.neg_score() > 0.75
                ]) for i in labs
            ]))
        st = WordNetLemmatizer()
        labs_filt = list(set([st.lemmatize(k) for k in labs_filt]))
        # Skip these LIWC categories when building the counts
        excluded_categories = {
            'bio', 'adj', 'verb', 'conj', 'adverb', 'auxverb', 'prep',
            'article', 'ipron', 'ppron', 'pronoun', 'function', 'affect',
            'cogproc'}
        liwc_counts = dict(
            Counter(
                top.split(' (')[0] for token in labs_filt
                for top in parse(token)
                if top.split(' (')[0].lower() not in excluded_categories))
        liwc_counts_ordered = dict(
            sorted(liwc_counts.items(), key=lambda x: x[1], reverse=True))

        if 'posemo' in liwc_counts_ordered and 'negemo' in liwc_counts_ordered:
            if liwc_counts_ordered['posemo'] > liwc_counts_ordered['negemo']:
                del liwc_counts_ordered['negemo']
            else:
                del liwc_counts_ordered['posemo']
        liwc_counts_ordered_ratios = {}
        for i in liwc_counts_ordered:
            liwc_counts_ordered_ratios[i] = float(
                liwc_counts_ordered[i]) / float(
                    sum(liwc_counts_ordered.values()))

        lab = ' '.join(
            map(str, [
                key + ' ' + str(np.round(100 * val, 2)) + '%'
                for key, val in liwc_counts_ordered_ratios.items()
            ]))
        print(lab)
        if len(lab) > 0:
            labels.append(lab)
        else:
            labels.append(np.nan)
        del roi_ids, labs_filt, lab, liwc_counts_ordered, liwc_counts, labs
        print('\n')

    return labels
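A hedged usage sketch for psycho_naming, assuming the NeuroSynth dataset and LIWC dictionary referenced in runconfig.yaml are available locally; the coordinates and node size below are purely illustrative:

# Hypothetical call: label two MNI coordinates using 6 mm spherical centroids.
coords = [(0, -52, 26), (46, -66, 30)]
labels = psycho_naming(coords, node_size=6)
print(labels)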
Example #23
from os.path import isfile, join

import shelve
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

import liwc
import re
from collections import Counter

LIWC_dictionary = '/home/xhta/Robot/liwc/timeori.dic'
POSP_METADATA = '/home/xhta/Robot/proj/posp/posp_metadata.csv'
POSP_CLEANDATA = '/home/xhta/Robot/proj/posp/clean'

parse, cat_names = liwc.load_token_parser(LIWC_dictionary)


def tokenize(text):
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


from string import punctuation
translator = str.maketrans(' ', ' ', punctuation)
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

import spacy
Example #24
    writer.writerows(csvData)
csvFile.close()

import re
from collections import Counter


def tokenize(text):
    # you may want to use a smarter tokenizer
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


import liwc

parse, category_names = liwc.load_token_parser('./LIWC2015_English.dic')

for text in texts:
    gettysburg_tokens = tokenize(text)
    # now flatmap over all the categories in all of the tokens using a generator:
    gettysburg_counts = Counter(category for token in gettysburg_tokens
                                for category in parse(token))
    # and print the results:
    print(gettysburg_counts)
"""
nodomaintags = []
domaintags = []
for text in domaintext:
    t = nlp(translator(text))
    labels = [x.label_ for x in t.ents]
    print(labels)
Example #25
from readorsee import settings
from readorsee.data.models import InstagramUser, InstagramPost
from readorsee.data.preprocessing import NLTKTokenizer, Tokenizer
import h5py
import os
import numpy as np
import pandas as pd
import liwc
from skimage import io, color
from typing import *
from collections import Counter
from pathlib import Path

if Path(settings.PATH_TO_PT_LIWC).exists():
    parse, category_names = liwc.load_token_parser(settings.PATH_TO_PT_LIWC)
tokenizer = NLTKTokenizer()

__all__ = ["get_features"]


def get_features(
    profile: InstagramUser, period: int
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]:
    posts = profile.get_posts_from_qtnre_answer_date(period)
    faces = []
    likes = []
    captions = []
    comments = []
    hue = []
    saturation = []
    value = []
Example #26
def age_check(path, id, id1):

    sql = "SELECT Chat_ID FROM chat WHERE (Participant1 = %s && Participant2 = %s) || (Participant1 = %s && Participant2 = %s)" % (
        id, id1, id1, id)
    mycursor.execute(sql)
    cid = mycursor.fetchone()

    subid = str(cid[0])
    file_name = path
    file_name1 = "C:\\Users\\Kripa\\Desktop\\" + subid + "1.txt"
    #f = open(file_name1,"w", encoding="utf8")
    with open(file_name, encoding="utf8") as chat:
        chat_text = chat.read()
    sr = ""
    for ch in chat_text:
        file1 = open('C:\\Users\\Kripa\\Desktop\\emo.txt',
                     'r',
                     encoding="utf8")
        Lines = file1.readlines()
        check = 0

        # Strips the newline character
        for line in Lines:
            #print(.format(count, line.strip()))
            if ch == line.strip():

                check = 1

        if check == 0:

            sr += ch
        else:
            sr += "~"

    f = open(file_name1, "w", encoding="utf8")
    f.write(sr)
    f.close()

    with open(file_name1, 'r') as in_file:
        stripped = (line.strip() for line in in_file)
        lines = (line.split(",") for line in stripped if line)

        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv',
                  'w',
                  newline='') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(('name', 'msg'))
            writer.writerows(lines)

    line = 0
    punct = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    with open("C:\\Users\\Kripa\\Desktop\\" + subid + "1.csv",
              'r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        for row in csv_reader:

            if row[0] == str(id):

                line += 1
                cntt[0] += len(row[1].split())
                x = 0
                sr = ""
                for val in row[1]:
                    if val == '~':
                        cntt[5] += 1
                    if val in punct:
                        cntt[2] += 1
                    else:
                        sr += val
                    x += 1
                cntt[1] += int(x / len(row[1].split()))

                for word in sr.split():
                    with open('C:\\Users\\Kripa\\Desktop\\slangdic.csv'
                              ) as csv_file1:
                        csv_reader1 = csv.reader(csv_file1, delimiter=',')

                        for row1 in csv_reader1:

                            if word == row1[0]:

                                cntt[4] += 1

                    duplicates = []

                    for char in word:
                        ## checking whether the character has a duplicate or not
                        ## str.count(char) returns the frequency of a char in the str
                        if word.count(char) > 2:
                            ## appending to the list if it's already not present
                            if char not in duplicates:
                                duplicates.append(char)
                    cntt[3] += len(duplicates)

    cntt[0] = int(cntt[0] / line)
    cntt[1] = int(cntt[1] / line)
    cntt[2] = cntt[2] - cntt[5]

    sql = "SELECT Posts, Followers, Following FROM user WHERE User_ID=%s" % id
    mycursor.execute(sql)
    res = mycursor.fetchone()
    cntt[6] = res[0]
    cntt[7] = res[1]
    cntt[8] = res[2]

    cn = np.asarray([cntt])
    cn = cn.astype('float64')

    model = keras.models.load_model("dnn30")
    predictions = model.predict_classes(cn)
    if predictions == [[1]]:

        lst = []
        line_count = 0
        lt = 0
        with open('C:\\Users\\Kripa\\Desktop\\age_ask.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                lst.append(row[0])
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                #print(row[0], id1)
                line_count = line_count + 1
                if row[0] == str(id1):
                    for ele in lst:

                        if ele in row[1]:

                            lt = line_count
                            break

        range = 0
        ag_ch = 0
        ln_ct = 0
        age_final = 0
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if ln_ct >= lt:
                    ln_ct = ln_ct + 1
                    if row[0] == str(id) and range < 10:
                        range = range + 1

                        for word in row[1].split():

                            if word.isnumeric():

                                a = int(word)

                                if a >= 12 and a <= 18:
                                    age_final = a
                                ag_ch = 1
                else:
                    ln_ct = ln_ct + 1

                if ag_ch == 1:
                    break
        if age_final == 0:
            age_final = 14
        sql = "INSERT INTO monitor VALUES(%s,%s)" % (id, age_final)
        mycursor.execute(sql)
        tbl = "user_" + str(id)
        sql = "create table {0} as select Chat_ID,Participant1 as Sender from chat inner join user on user.User_ID=chat.Participant2 where User_ID=%s UNION select Chat_ID,Participant2 as Sender from chat inner join user on user.User_ID=chat.Participant1 where User_ID=%s".format(
            tbl) % (id, id)
        mycursor.execute(sql)
        sql = "alter table {0} add S1 decimal default 0, add S2 decimal default 0, add S3 decimal default 0, add S4 decimal default 0, add S5 decimal default 0, add S6 decimal default 0, add Grooming_Not varchar(3) default 'No'".format(
            tbl)
        mycursor.execute(sql)
        sql = "alter table {0} add primary key(Chat_ID), add foreign key(Sender) references user(User_ID)".format(
            tbl)
        mycursor.execute(sql)
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                row[1] = row[1].lower()
                lemmatizer = WordNetLemmatizer()
                lemr = ""
                for word in row[1].split():
                    lem = (lemmatizer.lemmatize(word, pos="v"))
                    lem = (lemmatizer.lemmatize(lem))
                    lemr = lemr + lem + " "
                no_punct = ""
                for char in lemr:
                    if char not in punctuations:
                        no_punct = no_punct + char

                data = word_tokenize(no_punct)
                line_count += 1
                stopWords = set(stopwords.words('english'))
                wordsFiltered = []

                for w in data:
                    if w not in stopWords:
                        wordsFiltered.append(w)
                pred = "C:\\Users\\Kripa\\Desktop\\" + subid + "2.csv"
                with open(pred, 'a+', newline='') as out_file:
                    writer = csv.writer(out_file, delimiter=' ')
                    writer.writerow(wordsFiltered[:20])

        def tokenize(text):
            for match in re.finditer(r'\w+', text, re.UNICODE):
                yield match.group(0)

        def listtostring(s):
            str1 = " "
            return (str1.join(s))

        parse, category_names = liwc.load_token_parser(
            "C:\\Users\\Kripa\\Desktop\\bigdic.dic")
        cnt = array('i', [0, 0, 0, 0, 0, 0])
        predator = "C:\\Users\\Kripa\\Desktop\\" + subid + "2.csv"
        with open(predator) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            j = 0
            for row in csv_reader:

                p = row.copy()

                p1 = listtostring(p).lower()
                p_token = tokenize(p1)
                from collections import Counter
                op1 = Counter(category for token in p_token
                              for category in parse(token))
                op = dict(op1)
                l = list(op.keys())
                l.sort(reverse=True)
                if l:
                    j = l[0]

                if j == "S1":
                    cnt[0] = cnt[0] + 1
                if j == "S2":
                    cnt[1] = cnt[1] + 1
                if j == "S3":
                    cnt[2] = cnt[2] + 1
                if j == "S4":
                    cnt[3] = cnt[3] + 1
                if j == "S5":
                    cnt[4] = cnt[4] + 1
                if j == "S6":
                    cnt[5] = cnt[5] + 1

        sql = (
            "UPDATE {0} SET S1=%s, S2=%s, S3=%s, S4=%s, S5=%s, S6=%s WHERE Sender=%s"
            .format(tbl)) % (cnt[0], cnt[1], cnt[2], cnt[3], cnt[4], cnt[5],
                             id1)
        mycursor.execute(sql)
        mydb.commit()
        import svm

        svm.func(tbl, id1)

        sql = (
            "SELECT Grooming_Not from {0} WHERE Sender=%s".format(tbl)) % id1
        mycursor.execute(sql)
        ress = mycursor.fetchall()
        check = [('Yes', )]

        # Check if conversation is grooming
        if ress == check:
            mydb.commit()
            # Alert via mail
            import mail
            mail.main_func(id, id1)
        os.remove('C:\\Users\\Kripa\\Desktop\\' + subid + '2.csv')

    os.remove(file_name1)
    os.remove('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv')
Example #27
def test_category_names():
    _, category_names = liwc.load_token_parser(os.path.join(test_dir, "alpha.dic"))
    assert category_names == ["A", "Bravo"]
Example #28
import os.path

dict_path = 'C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\' \
            '1. Projects\\3. Cust Value Chain Analysis\\2. LIWC\\'
working_path = "C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\" \
               "1. Projects\\3. Cust Value Chain Analysis\\" \
               "1. Apple Podcast_speeches\\import files\\"
export_path = "C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\"\
              "1. Projects\\3. Cust Value Chain Analysis\\"

os.chdir(working_path)
import pandas as pd

import liwc
parse, category_names = liwc.load_token_parser(dict_path +
                                               'Cust_val_chain_keywords.dic')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import string
import re
import numpy as np
from dfply import *
from math import *
import glob

from collections import Counter

# locate the files inside the current folder
file_list = glob.glob("*.DOC")
Example #29
def tokenize(text):
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)
        
def select(cur, variable, table):
    """
    Database function to retrieve a variable
    """
    cur.execute("SELECT {v} FROM {t}".format(v = variable, t = table))
    variable = cur.fetchall()
    variable = [i[0] for i in variable]
    return variable


import liwc
parse, category_names = liwc.load_token_parser('LIWC2007_English080730.dic')


descriptions = np.array(select(cur,"DESCRIPTION", "data11"))
description_trans = np.array(select(cur,"DESCRIPTION_TRANSLATED", "data11"))

description = []


for i in range(len(descriptions)):
    if description_trans[i] == '':
        descr = descriptions[i]
    else:
        descr = description_trans[i]
    description.append(descr)
def get_categories_for_word(word, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    return [category for category in parse(word)]