def request_Key_Word_Classifier(submission, phrase_set):
    """Hand-made keyword classifier for reddit-typical help requests.

    Flags titles that ask for help in common reddit phrasings
    ("need help", "[question]", "plz help", ...) even when the title is
    not phrased as a question.

    Parameters
    ----------
    submission : praw submission object
        Only ``submission.title`` is read.
    phrase_set : iterable of str
        Key phrases to search for (e.g. loaded via
        botHelperFunctions.load_autoreply_key_phrases from
        misc/autoreplyKeyPhrases.txt).

    Returns
    -------
    bool
        True if any normalized phrase occurs as a substring of the
        normalized title, False otherwise.
    """
    # Normalize the title through the same parser used on the phrases so
    # the substring test compares like with like.
    # NOTE(review): only the phrase is lower()-ed here, not the title —
    # presumably parseStringSimple01 already lowercases; confirm.
    text = ' '.join(summarizeText.parseStringSimple01(submission.title))
    for phrase in phrase_set:
        normalized = ' '.join(summarizeText.parseStringSimple01(phrase)).lower()
        if normalized in text:
            # Lazy %-style args so formatting only happens if INFO is on.
            logging.info("%s Was used in the post title", phrase)
            return True
    return False
def xrequest_Key_Word_Filter(submission, phrase_set):
    """Keyword filter that also extracts a sentence from matching posts.

    Like request_Key_Word_Classifier, this spots titles that make
    reddit-typical help requests without necessarily asking a question.
    When a phrase matches, the last sentence of the post body is
    returned as a candidate summary sentence.

    Parameters
    ----------
    submission : praw submission object
        ``submission.title`` and ``submission.selftext`` are read.
    phrase_set : iterable of str
        Key phrases to search for.

    Returns
    -------
    str or False
        The last sentence of the post body if a phrase matched and the
        body tokenized into at least one sentence; otherwise False.
    """
    # Normalize the title the same way the phrases are normalized below.
    text = ' '.join(summarizeText.parseStringSimple01(submission.title))

    request_Made = False
    for phrase in phrase_set:
        if ' '.join(summarizeText.parseStringSimple01(phrase)).lower() in text:
            # Lazy %-style args so formatting only happens if INFO is on.
            logging.info("%s Was used in the post title", phrase)
            request_Made = True
            break

    if request_Made:
        body_tokens = summarizeText.parseStringSimple01(submission.selftext)
        sents = nltk.sent_tokenize(' '.join(body_tokens))
        # Returning the last sentence is purely a guess; selecting
        # sentences by idf/entropy score would likely be more useful.
        try:
            return sents[-1]
        except IndexError:
            # Empty selftext (e.g. a link post) produces no sentences.
            # Narrowed from a bare `except:` so real errors still raise.
            logging.info("Failed to grab last sentence: Probably links offsite")

    return False
def basicQuestionClassify(submission, classifier):
    """A really simple classifier for basic questions.

    If a submission is old enough (>2 hours), has low enough votes
    (score <= 0, upvote ratio <= 0.41), is a self-post, and contains
    sentences classified as questions, it is treated as a basic
    question that r/learnpython is better suited for.

    Parameters
    ----------
    submission : praw submission object
    classifier : nltk classifier object
        Passed through to questionIdentifier.classifyString.

    Returns
    -------
    list of str or False
        The sentences classified as questions, or False when the
        submission fails any gate or no question sentence is found.
    """
    # created_utc is a Unix timestamp (float) in praw; convert it to a
    # datetime before subtracting. (Subtracting the raw float from a
    # datetime, as the original did, raises TypeError.)
    postAge = datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(submission.created_utc)
    hours2 = datetime.timedelta(hours=2)
    votes = submission.score
    upvoteRatio = submission.upvote_ratio

    # Guard clauses: too new, scoring well, or not unpopular enough.
    if postAge < hours2:
        return False
    if votes > 0:
        return False
    if upvoteRatio > 0.41:
        return False
    if submission.id not in submission.url:
        # Self-posts embed their own id in the URL; a mismatch means the
        # submission links off-site and cannot be classified here.
        logging.debug('\t' + 'Results: Error. Classification is dead, (URL) Mismatch. ')
        return False

    # ID if a question is here right now: tokenize title + body into
    # sentences and classify each one.
    title = summarizeText.parseStringSimple01(submission.title, removeURL=True)
    text = summarizeText.parseStringSimple01(submission.selftext, removeURL=True)
    postText = title + text
    sents = nltk.sent_tokenize(' '.join(postText))

    question_Sents = []
    for sent in sents:
        classified = questionIdentifier.classifyString(sent, classifier)
        if "question" in classified.lower():
            question_Sents.append(sent)
            # NOTE(review): source indentation was lost; these debug
            # lines are assumed to belong to the match branch — confirm.
            logging.debug(str(classified) + ': ' + str(sent))
            print('\tSentence: ', sent)
            print('\tClassified As: ', classified)

    if question_Sents:
        logging.info('|'.join(question_Sents))
        return question_Sents

    logging.debug("\tNo Question Identified")
    # All else
    return False