from collections import Counter

# Assumed external helpers, inferred from usage below (not defined here):
#   sf -- project string/syllable utilities (is_valid_url, is_twitter_user,
#         is_hashtag, nsyllables, nsyl)
#   d  -- Jaccard-distance provider (e.g. "import distance as d")
#   h  -- module providing the Haiku container class


def writehaiku(trend, tweets):
    # Print preamble
    # print "Poet0: "

    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]
    # logging.debug("Filtered wordlist is now: ")
    # logging.debug(filteredWords)

    # Get the list of unique words with their counts
    uniqueWords = Counter(filteredWords)
    # Get the most common words
    topWords = uniqueWords.most_common(5)

    # For the top filtered words, collect phrases of 2 to 6 words containing them
    phrases = []
    for n in range(2, 7):
        for word in topWords:
            idx = n - 1
            try:
                while filteredWords[idx:len(filteredWords) - n].index(word[0]) >= 0:
                    idx = filteredWords[idx:len(filteredWords) - n].index(word[0]) + idx
                    # collect every n-word window that contains this occurrence
                    for i in range(0, n):
                        phrases.append(" ".join(filteredWords[(idx - i):(idx - i + n)]))
                    idx += 1
            except ValueError:
                # no further occurrences of this word in the remaining slice
                idx = 0

    uniquePhrases = Counter(phrases)
    topPhrases = uniquePhrases.most_common(200)

    # Compute the syllable length for each phrase
    listPhrases = [list(phrase) for phrase in topPhrases]
    for phrase in listPhrases:
        phrase.append(sf.nsyllables(phrase[0]))

    # Use Jaccard distance to choose top-tweeted phrases that are not similar to one another
    Phrase1 = ""
    Phrase2 = ""
    Phrase3 = ""

    # Populate the 7-syllable line first with the top-tweeted phrase of seven syllables
    for phrase in listPhrases:
        if phrase[2] == 7 and Phrase1 == "":
            Phrase2 = phrase[0]
            break

    # Pick the most popular 5-syllable phrase whose Jaccard distance from the
    # 2nd line is at least 0.4 (i.e. with low similarity to it)
    if Phrase2 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.4:
                if Phrase1 == "":
                    Phrase1 = phrase[0]

    # Pick the most popular 5-syllable phrase sufficiently dissimilar to both
    # the 1st and the 2nd line (Jaccard distance of at least 0.6 from each)
    if Phrase2 != "" and Phrase1 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.6 and d.jaccard(Phrase1, phrase[0]) >= 0.6:
                if Phrase3 == "":
                    Phrase3 = phrase[0]

    myHaiku = h.Haiku()
    # Construct the haiku
    if Phrase1 != "" and Phrase2 != "" and Phrase3 != "":
        myHaiku.length = len(Phrase1) + len(Phrase2) + len(Phrase3)
        myHaiku.text = [Phrase1, Phrase2, Phrase3]
    # return [[Phrase1, Phrase2, Phrase3], len(Phrase1) + len(Phrase2) + len(Phrase3)]
    return myHaiku
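# The d.jaccard(a, b) calls above are assumed to return a Jaccard *distance*
# (0.0 for identical inputs, 1.0 for inputs with nothing in common), as
# provided e.g. by the third-party "distance" package. A minimal, purely
# illustrative stand-in over the phrases' word sets might look like this
# (hypothetical helper, not part of the original code):
def jaccard_distance(phrase_a, phrase_b):
    """Return 1 - |intersection| / |union| over the two phrases' word sets."""
    set_a, set_b = set(phrase_a.split()), set(phrase_b.split())
    if not set_a and not set_b:
        return 0.0
    return 1.0 - len(set_a & set_b) / float(len(set_a | set_b))

# Example: jaccard_distance("hello world", "hello there") == 1 - 1/3 = 0.67,
# which would clear the >= 0.4 dissimilarity threshold used above.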
def writehaiku(trend, tweets):
    # Print preamble
    # print "Poet0: "

    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]
    # logging.debug("Filtered wordlist is now: ")
    # logging.debug(filteredWords)

    # Get the list of unique words with their counts
    uniqueWords = Counter(filteredWords)
    # Get the most common words
    topWords = uniqueWords.most_common(5)

    # For the top filtered words, collect phrases of 2 to 6 words containing them
    phrases = []
    for n in range(2, 7):
        for word in topWords:
            idx = n - 1
            try:
                while filteredWords[idx:len(filteredWords) - n].index(word[0]) >= 0:
                    idx = filteredWords[idx:len(filteredWords) - n].index(word[0]) + idx
                    # collect every n-word window that contains this occurrence
                    for i in range(0, n):
                        phrases.append(" ".join(filteredWords[(idx - i):(idx - i + n)]))
                    idx += 1
            except ValueError:
                # no further occurrences of this word in the remaining slice
                idx = 0

    uniquePhrases = Counter(phrases)
    topPhrases = uniquePhrases.most_common(200)

    # Compute the syllable length for each phrase
    listPhrases = [list(phrase) for phrase in topPhrases]
    for phrase in listPhrases:
        phrase.append(sf.nsyllables(phrase[0]))

    # Select the two most commonly tweeted 5-syllable phrases and the most
    # commonly tweeted 7-syllable phrase
    Phrase1 = ""
    Phrase2 = ""
    Phrase3 = ""
    for phrase in listPhrases:
        if phrase[2] == 5:
            if Phrase1 == "":
                Phrase1 = phrase[0]
            else:
                if Phrase3 == "":
                    Phrase3 = phrase[0]
                    # stop early only once the 7-syllable line has also been found
                    if Phrase2 != "":
                        break
        if phrase[2] == 7:
            if Phrase2 == "":
                Phrase2 = phrase[0]

    myHaiku = h.Haiku()
    # Construct the haiku
    if Phrase1 != "" and Phrase2 != "" and Phrase3 != "":
        myHaiku.length = len(Phrase1) + len(Phrase2) + len(Phrase3)
        myHaiku.text = [Phrase1, Phrase2, Phrase3]
    # return [[Phrase1, Phrase2, Phrase3], len(Phrase1) + len(Phrase2) + len(Phrase3)]
    return myHaiku
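# h.Haiku() above is assumed to be a plain container with .text and .length
# attributes; nothing more is visible from this code. A minimal sketch of
# such a class (hypothetical, inferred only from how it is used here):
class Haiku(object):
    def __init__(self):
        self.text = []    # the three lines of the poem
        self.length = 0   # combined character length of the three lines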
import nltk  # for part-of-speech tagging; the "sf" helpers noted above are assumed here too


def writehaiku(trend, tweets):
    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs, twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words shorter than four characters or longer than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)
    filteredWords = [word for word in allWords if word not in invalidWords]

    # Part-of-speech tag the remaining words
    tagged = nltk.pos_tag(filteredWords)

    # Line 1: "I am a(n) <two-syllable noun>."  (five syllables)
    line1 = ""
    for word in tagged:
        if line1 != "":
            continue
        if sf.nsyl(word[0]) == 2 and word[1] == 'NN':
            # use "an" before words starting with a vowel (or "y")
            if word[0][0].lower() in ['a', 'e', 'i', 'o', 'u', 'y']:
                line1 = "I am an " + word[0] + "."
            else:
                line1 = "I am a " + word[0] + "."

    # Line 2: "and <three-syllable proper noun> is my <one-syllable noun>."  (seven syllables)
    word2 = ""
    for word in tagged:
        if word2 != "":
            continue
        if sf.nsyl(word[0]) == 3 and word[1] == 'NNP':
            word2 = word[0]
    word3 = ""
    for word in tagged:
        if word3 != "":
            continue
        if sf.nsyl(word[0]) == 1 and word[1] == 'NN':
            word3 = word[0]
    line2 = "and {} is my {}.".format(word2, word3)

    # Line 3: "I <one-syllable verb> with the night!"  (five syllables)
    line3 = ''
    for word in tagged:
        if line3 != "":
            continue
        if sf.nsyl(word[0]) == 1 and word[1] == 'VB':
            line3 = "I {} with the night!".format(word[0])

    out = line1 + '\n' + line2 + '\n' + line3
    return out
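# sf.nsyl(word) above is assumed to return the word's syllable count. One
# common approximation (an assumption, not necessarily what sf does) uses the
# CMU Pronouncing Dictionary bundled with NLTK: count the stress-marked
# phonemes, i.e. those ending in a digit, of the first listed pronunciation.
from nltk.corpus import cmudict

_pronunciations = cmudict.dict()  # requires a one-time nltk.download('cmudict')

def count_syllables(word):
    """Hypothetical syllable counter; returns None for out-of-vocabulary words."""
    phoneme_lists = _pronunciations.get(word.lower())
    if not phoneme_lists:
        return None
    return sum(1 for phoneme in phoneme_lists[0] if phoneme[-1].isdigit())

# Example: count_syllables("poetry") == 3, count_syllables("night") == 1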