def ComputeVocabulary():
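    """Build the stemmed-unigram vocabulary from the vocab_comments table.

    For every stemmed, stopword-filtered token, count how many comments it
    appears in, write the stems that meet the frequency cutoff to
    apidata/vocab_freq.json, and print the cutoff index.
    """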
    try:
        cursor.execute("select commentBody from vocab_comments")
        n = 0
        for row in cursor:
            n = n + 1
            if n % 100 == 0:
                print(n)
            ct = CleanAndTokenize(row[0])
            ct = [w for w in ct if w not in stopword_list]
            stemmed_tokens = [porter.stem(t) for t in ct]
            # count each stem at most once per comment so that doc_frequency
            # is a true document frequency (number of comments containing it)
            for t in set(stemmed_tokens):
                doc_frequency[t] = doc_frequency.get(t, 0) + 1
        sorted_list = sorted(doc_frequency.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        # find the frequency cutoff: keep only stems appearing in >= 10 comments
        unigram_cutoff = len(sorted_list) - 1
        json_data = {}
        for (i, (word, word_freq)) in enumerate(sorted_list):
            if word_freq < 10:
                unigram_cutoff = i - 1
                break
            json_data[word] = word_freq
        with open("apidata/vocab_freq.json", "w") as out_file:
            json.dump(json_data, out_file)
        print("unigram cutoff: " + str(unigram_cutoff))
    except Exception as e:
        print("ComputeVocabulary failed: " + str(e))
        sys.exit(1)
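# Hypothetical helper (not part of the original code): a sketch of how the
# frequency table written by ComputeVocabulary() could be loaded back into the
# vocab_freq dict that the relevance functions below read from. The path and
# dict layout match what ComputeVocabulary() writes.
def LoadVocabFrequencies(path="apidata/vocab_freq.json"):
    with open(path) as in_file:
        return json.load(in_file)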
def calcPersonalXPScores(comment_text):
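    """Score how 'personal' a comment reads: the fraction of its tokens whose
    stem appears in the personal_words list, rounded to three decimals."""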
    # comment_text = comment_text.decode("utf-8")
    # tokenizer = WhitespaceTokenizer()
    personal_xp_score = 0
    text = comment_text.lower()

    #filter out punctuations
    punctuations = string.punctuation  # i.e. !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    excluded_punctuations = ["$", "%", "'"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # tokenize the lowercased, punctuation-stripped text
    token_list = CleanAndTokenize(text)
    text_tokens = token_list
    # comment_stemmed_tokens = [porter.stem(token) for token in token_list]
    # if the tokens are in the personal_words List then increment score
    for tok in text_tokens:
        tok_stem = porter.stem(tok)
        if tok_stem in personal_words:
            personal_xp_score = personal_xp_score + 1

    # normalize by number of tokens
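    # e.g. (hypothetical numbers) 3 personal-word stems in a 20-token comment
    # gives a score of 3 / 20 = 0.15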
    if len(text_tokens) > 0:
        personal_xp_score = float(personal_xp_score) / float(len(text_tokens))
    else:
        personal_xp_score = 0.0
    return round(personal_xp_score,3)
def calLength(comment_text):
    token = CleanAndTokenize(comment_text)
    return len(token)
def ComputeCommentArticleRelevance(comment_text, ID, operation):
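    """Cosine similarity between the tf-idf vectors of the comment text and
    the full text of its article.

    operation == 'add': ID is an articleID; operation == 'update': ID is a
    commentID whose articleID is looked up. Returns 0.0 if the article or
    comment cannot be found.
    """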

    cnx = mysql.connector.connect(user=user,
                                  password=password,
                                  host=host,
                                  database=database)
    cursor = cnx.cursor()

    if operation == 'add':
        articleID = ID
        cursor.execute("select full_text from articles where articleID = '" +
                       str(articleID) + "'")
        article_data = cursor.fetchall()
    elif operation == 'update':
        commentID = ID
        cursor.execute("select articleID from comments where commentID ='" +
                       str(commentID) + "' ")
        fetch_data = cursor.fetchall()
        if len(fetch_data) > 0:
            articleID = fetch_data[0][0]
        else:
            ArticleRelevance = 0.0
            return ArticleRelevance
        cursor.execute("select full_text from articles where articleID = '" +
                       str(articleID) + "'")
        article_data = cursor.fetchall()
    else:
        ArticleRelevance = 0.0
        return ArticleRelevance
    cnx.close()

    if len(article_data) < 1:
        ArticleRelevance = 0.0
        return ArticleRelevance
    for data in article_data:
        article_text = data[0]

    comment_text = comment_text.strip()

    # clean and tokenize the comment text and article text, also exclude the stopwords
    token_list = CleanAndTokenize(comment_text)
    token_list = [word for word in token_list if word not in stopword_list]
    comment_stemmed_tokens = [porter.stem(token) for token in token_list]
    comment_stemmed_tokens_fd = FreqDist(comment_stemmed_tokens)
    token_list = CleanAndTokenize(article_text)
    token_list = [word for word in token_list if word not in stopword_list]
    article_stemmed_tokens = [porter.stem(token) for token in token_list]
    article_stemmed_tokens_fd = FreqDist(article_stemmed_tokens)

    # now create the feature vectors for article and comment
    article_features = {}
    comment_features = {}

    # Calculate weight for each word in the comment with tf-idf
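    # weight(w) = tf(w, text) * log(nDocuments / df(w)), where df(w) is the
    # number of vocabulary comments containing stem w (from vocab_freq)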
    for w in vocab_freq:
        df = vocab_freq[w]
        # use true division so the N/df ratio is not truncated before the log
        idf = math.log(float(nDocuments) / float(df))
        if w in article_stemmed_tokens:
            article_features[w] = article_stemmed_tokens_fd[w] * idf
        else:
            article_features[w] = 0.0
        if w in comment_stemmed_tokens:
            comment_features[w] = comment_stemmed_tokens_fd[w] * idf
        else:
            comment_features[w] = 0.0

    # normalize vectors
    article_features = NormalizeVector(article_features)
    comment_features = NormalizeVector(comment_features)
    comment_article_similarity = ComputeCosineSimilarity(
        article_features, comment_features)
    return comment_article_similarity
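# Sketch (an assumption, since NormalizeVector and ComputeCosineSimilarity are
# defined elsewhere in this module) of the behaviour the code above relies on:
# L2-normalize a {term: weight} dict, then dot two normalized dicts. The
# _sketch names are hypothetical and avoid shadowing the real helpers.
def _normalize_vector_sketch(vec):
    norm = math.sqrt(sum(v * v for v in vec.values()))
    if norm == 0:
        return vec
    return dict((k, v / norm) for k, v in vec.items())


def _cosine_similarity_sketch(vec_a, vec_b):
    return sum(vec_a[k] * vec_b.get(k, 0.0) for k in vec_a)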
def ComputeCommentConversationalRelevance(comment_text, ID, operation):
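    """Cosine similarity between the comment's tf-idf vector and the tf-idf
    vector of the centroid of the article's other comments.

    operation == 'add': ID is an articleID; operation == 'update': ID is a
    commentID, and only earlier comments on the same article form the
    centroid. Returns 0.0 when fewer than two comments are available.
    """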

    cnx = mysql.connector.connect(user=user,
                                  password=password,
                                  host=host,
                                  database=database)
    cursor = cnx.cursor()

    if operation == 'add':
        articleID = ID
        cursor.execute("select commentBody from comments where articleID = '" +
                       str(articleID) + "' ")
        comment_data = cursor.fetchall()
    elif operation == 'update':
        commentID = ID
        cursor.execute("select articleID from comments where commentID ='" +
                       str(commentID) + "' ")
        fetch_data = cursor.fetchall()
        if len(fetch_data) > 0:
            articleID = fetch_data[0][0]
        else:
            ConversationalRelevance = 0.0
            return ConversationalRelevance
        cursor.execute("select commentBody from comments "
                       "where articleID = '" + str(articleID) +
                       "' and commentID < '" + str(commentID) + "' ")
        comment_data = cursor.fetchall()
    else:
        ConversationalRelevance = 0.0
        return ConversationalRelevance
    cnx.close()
    if len(comment_data) < 2:
        ConversationalRelevance = 0.0
        return ConversationalRelevance

    centroid_comment_stemmed_tokens = []
    centroid_comment_features = {}

    # clean and tokenize the all the comments text and also exclude the stopwords
    comment_list = [row[0] for row in comment_data]
    for comment in comment_list:
        token_list = CleanAndTokenize(comment)
        token_list = [word for word in token_list if word not in stopword_list]
        # Update and compute the centroid
        centroid_comment_stemmed_tokens.extend(
            [porter.stem(token) for token in token_list])
    centroid_comment_stemmed_tokens_fd = FreqDist(
        centroid_comment_stemmed_tokens)
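    # The centroid treats all of these comments as one large pseudo-document;
    # the new comment is compared against its tf-idf vector below.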

    # Calculate weight for each word in all the comments with tf-idf
    for w in vocab_freq:
        df = vocab_freq[w]
        # use true division so the N/df ratio is not truncated before the log
        idf = math.log(float(nDocuments) / float(df))
        if w in centroid_comment_stemmed_tokens:
            centroid_comment_features[w] = centroid_comment_stemmed_tokens_fd[w] * idf
        else:
            centroid_comment_features[w] = 0.0

    # normalize vector
    centroid_comment_features = NormalizeVector(centroid_comment_features)

    # Now compute the similarity between this comment and the centroid
    comment_stemmed_tokens = []
    comment_features = {}
    comment_text = comment_text.strip()
    token_list = CleanAndTokenize(comment_text)
    token_list = [word for word in token_list if word not in stopword_list]
    comment_stemmed_tokens.extend([porter.stem(token) for token in token_list])
    comment_stemmed_tokens_fd = FreqDist(comment_stemmed_tokens)
    # Calculate weight for each word in the comment with tf-idf
    for w in vocab_freq:
        df = vocab_freq[w]
        # use true division so the N/df ratio is not truncated before the log
        idf = math.log(float(nDocuments) / float(df))
        if w in comment_stemmed_tokens:
            comment_features[w] = comment_stemmed_tokens_fd[w] * idf
        else:
            comment_features[w] = 0.0
    comment_features = NormalizeVector(comment_features)
    conversational_relevance = ComputeCosineSimilarity(centroid_comment_features,
                                                       comment_features)
    return conversational_relevance
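# Hypothetical usage (placeholder IDs; assumes the module-level DB settings,
# vocab_freq, nDocuments, stopword_list, porter, etc. are already initialized):
#
#   article_rel = ComputeCommentArticleRelevance(comment_text, 12345, 'add')
#   conversation_rel = ComputeCommentConversationalRelevance(comment_text, 67890, 'update')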