def testTimeFeatures(self):
    # Checks the time-difference flags returned by Time_Features.getTimeFeature
    # for comment pairs that are seconds, days and months apart.
    print("Start the time feature test")
    first_author = Author("VHS", 5749570, 8192)
    second_author = Author("VHS", 5749570, 8192)
    early_comment = Comment("This is a good answer", first_author,
                            datetime.datetime(2020, 5, 17, 23, 8, 15))
    reference = Comment("This is a bad answer", second_author,
                        datetime.datetime(2020, 5, 17, 23, 8, 55))
    extractor = Time_Features()

    # 40 seconds apart: only the minute flag is checked.
    flags = extractor.getTimeFeature(early_comment, reference)
    self.assertEqual(flags["tdiff_minute"], 1)

    # Four days after the reference comment.
    days_later = Comment("This is an ok answer", second_author,
                         datetime.datetime(2020, 5, 21, 23, 8, 55))
    flags = extractor.getTimeFeature(days_later, reference)
    for key, expected in (("tdiff_minute", 0), ("tdiff_week", 1)):
        self.assertEqual(flags[key], expected)

    # Roughly three months before the reference comment.
    months_earlier = Comment("This is an ok answer", second_author,
                             datetime.datetime(2020, 2, 21, 23, 8, 55))
    flags = extractor.getTimeFeature(months_earlier, reference)
    for key, expected in (("tdiff_minute", 0),
                          ("tdiff_week", 0),
                          ("tdiff_half_year", 1)):
        self.assertEqual(flags[key], expected)
    print("End of the time feature test")
def testSameThirdSpeaker(self):
    # Two comments that both address "@VHS" must be reported as referring to
    # the same third speaker; a bare "VHS" without "@" or a different
    # "@" mention must not.
    print("Start the refers to the same third speaker test")
    holger = Author("Holger", 5749570, 18192)
    chux = Author("Chux", 2410359, 99547)
    mentions_vhs = Comment("This is a good answer, @VHS", holger,
                           datetime.datetime(2020, 5, 17))
    thanks_vhs = Comment("Thanks @VHS. I appreciate it", chux,
                         datetime.datetime(2010, 5, 17))
    detector = Speaker_Feature()
    self.assertTrue(
        detector.refersToSameThirdSpeaker(mentions_vhs, thanks_vhs),
        msg="Refers to same third speaker not detected")

    false_positive_msg = "Refers to same third person shouldn't have been detected"
    skeet = Author("Jon Skeet", 22656, 1186748)

    # "VHS" appears without the "@" marker.
    plain_name = Comment("+1 VHS for your answer", skeet,
                         datetime.datetime(2015, 5, 17))
    self.assertFalse(
        detector.refersToSameThirdSpeaker(plain_name, mentions_vhs),
        msg=false_positive_msg)

    # A different user is mentioned.
    other_mention = Comment("Thank you @ViralSheth. Glad to be of help", skeet,
                            datetime.datetime(2015, 5, 17))
    self.assertFalse(
        detector.refersToSameThirdSpeaker(other_mention, mentions_vhs),
        msg=false_positive_msg)
    print("End of the refers to the same third speaker test")
def testUndefinedSemantics(self):
    # Compares a long, mark-up heavy comment with a reply that is little more
    # than a link and expects a non-zero cosine similarity.
    a1 = Author("SusanW", 5851520, 1461)
    # The text itself contains double quotes, so the literal is single-quoted;
    # leaving those inner quotes unescaped in a double-quoted literal is a
    # syntax error.
    c1 = Comment(
        '@AmrishPandey "finally block is not called in case of exception '
        'thrown by daemon thread" - really?? [Citation Needed], I think? '
        'Actually <code>thread.stop()</code> does not necessarily prevent '
        '<code>finally</code> block from being executed.',
        a1, datetime.datetime(2017, 4, 25))
    a2 = Author("", 0, 0)
    # Original note on this reply: doesn't match any words in vocab.
    # NOTE(review): the assertion below nevertheless requires a non-zero
    # similarity -- confirm that this is the intended expectation.
    c2 = Comment("@SusanW javarevisited.blogspot.in/2012/03/…", a2,
                 datetime.datetime(2020, 6, 24))
    self.assertNotEqual(self.semFeature.cosine_similarity(c1, c2), 0.0)
def testRelated(self):
    # Two comments that both talk about Java lambdas must not get a BERT
    # cosine similarity of exactly 0.0.
    about_lambda = Comment("Lambda is a good feature of Java 8",
                           Author("SusanW", 5851520, 1461),
                           datetime.datetime(2017, 4, 25))
    about_java = Comment("Java needed lambdas for a long time",
                         Author("", 0, 0),
                         datetime.datetime(2020, 6, 24))
    score = self.bertFeature.cosine_similarity(about_lambda, about_java)
    print("Similarity on related comments is ", score)
    self.assertNotEqual(score, 0.0)
def testWeightedIsDifferent(self):
    # The weighted and the plain cosine similarity of the same comment pair
    # are expected to differ.
    longer = Comment("This is a good java answer", Author("", 0, 0),
                     datetime.datetime(2020, 1, 1))
    shorter = Comment("I like java", Author("", 0, 0),
                      datetime.datetime(2020, 1, 1))
    semantics = self.semFeature

    # Both scores are printed first, then recomputed for the comparison.
    print(semantics.weighted_cosine_similarity(longer, shorter))
    print(semantics.cosine_similarity(longer, shorter))
    weighted_score = semantics.weighted_cosine_similarity(longer, shorter)
    plain_score = semantics.cosine_similarity(longer, shorter)
    self.assertTrue(weighted_score != plain_score)
def testJaccardFeatures(self):
    # Both comments mention "PrintWriter"; jaccard_code_feature is expected to
    # score the pair 1.0.
    # NOTE(review): a later method in this file is also named
    # testJaccardFeatures; if both belong to the same class, the later
    # definition replaces this one -- confirm.
    first_author = Author("VHS", 5749570, 8192)
    second_author = Author("VHS", 5749570, 8192)
    left = Comment("Worth noting PrintWriter", first_author,
                   datetime.datetime(2020, 5, 17, 23, 8, 15))
    right = Comment("PrintWriter is worth", second_author,
                    datetime.datetime(2020, 5, 17, 23, 8, 55))
    similarity = Text_Similarity_Features()
    self.assertEqual(similarity.jaccard_code_feature(left, right), 1.0)
def testComparativeSemantics_Weighted(self):
    # "I like junit" / "I like java" is expected to score a higher weighted
    # cosine similarity than either pair that involves "I like python".
    def anonymous_comment(body):
        # Comment by an empty author, dated 2020-01-01.
        return Comment(body, Author("", 0, 0), datetime.datetime(2020, 1, 1))

    junit = anonymous_comment("I like junit")
    java = anonymous_comment("I like java")
    python = anonymous_comment("I like python")
    weighted = self.semFeature.weighted_cosine_similarity

    junit_java = weighted(junit, java)
    java_python = weighted(java, python)
    self.assertTrue(junit_java > java_python)

    junit_java = weighted(junit, java)
    junit_python = weighted(junit, python)
    self.assertTrue(junit_java > junit_python)
def testJaccardFeatures(self):
    # The two comments differ in a single word ("good" vs "bad");
    # jaccard_feature is expected to score the pair 2 / 3.
    # NOTE(review): an earlier method in this file is also named
    # testJaccardFeatures; if both belong to the same class, only this later
    # definition is kept -- confirm.
    print("Start the Jaccard feature test")
    first_author = Author("VHS", 5749570, 8192)
    second_author = Author("VHS", 5749570, 8192)
    praise = Comment("This is a good answer", first_author,
                     datetime.datetime(2020, 5, 17, 23, 8, 15))
    criticism = Comment("This is a bad answer", second_author,
                        datetime.datetime(2020, 5, 17, 23, 8, 55))
    similarity = Text_Similarity_Features()
    self.assertEqual(similarity.jaccard_feature(praise, criticism), 2 / 3)
def retrieveComments(self, ansId, timeout=30):
    """Fetch the comments of a Stack Overflow answer as Comment objects.

    Requests the first page (at most 100 comments, sorted by creation date
    in ascending order) from the Stack Exchange API.

    Args:
        ansId: id of the answer whose comments are requested.
        timeout: seconds to wait for the server before giving up; passed to
            ``requests.get``. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        A list of Comment objects in the order returned by the API. Each
        timestamp is a string formatted as '%Y-%m-%d %H:%M:%S'.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
        KeyError: if the response JSON has no 'items' entry.
    """
    # NOTE(review): the API key is hard-coded in the URL; consider loading it
    # from configuration instead of keeping it in source control.
    webapi = ('https://api.stackexchange.com/2.2/answers/' + str(ansId) +
              '/comments?page=1&pagesize=100&order=asc&sort=creation'
              '&site=stackoverflow&key=aT0javmxqIqcfwsWoTDA4w(('
              '&filter=!)srVqBhiaSgtcRgeozZw')
    json_data = requests.get(webapi, timeout=timeout).json()

    comments = []
    for item in json_data['items']:
        owner = item['owner']
        uname = owner['display_name']
        if 'user_id' in owner:
            # If the user is not deleted from SO
            uid = owner['user_id']
            reputation = owner['reputation']
        else:
            uid = None
            reputation = 0
        author = Author(uname, uid, reputation)

        # creation_date is an epoch value; fromtimestamp() converts it using
        # the local time zone of the machine running this code.
        timestamp = datetime.datetime.fromtimestamp(
            item['creation_date']).strftime('%Y-%m-%d %H:%M:%S')
        comments.append(Comment(item['body'], author, timestamp, item['score']))
    return comments
def testSameSpeaker(self):
    # Comments by authors with identical details must be reported as coming
    # from the same speaker; a comment by a different author must not.
    print("Start the same speaker test")
    vhs = Author("VHS", 5749570, 8192)
    vhs_again = Author("VHS", 5749570, 8192)
    recent = Comment("This is a good answer", vhs, datetime.datetime(2020, 5, 17))
    old = Comment("This is a bad answer", vhs_again, datetime.datetime(2010, 5, 17))
    detector = Speaker_Feature()
    self.assertTrue(detector.isSameSpeaker(recent, old),
                    msg="Same author is not detected")

    skeet = Author("Jon Skeet", 22656, 1186748)
    by_skeet = Comment("+1 for the answer", skeet, datetime.datetime(2015, 5, 17))
    self.assertFalse(detector.isSameSpeaker(recent, by_skeet),
                     msg="Different author is not detected")
    print("End of the same speaker test")
def loadData(self):
    """Read the enriched CSV file and group its comments by post.

    Each row is read as: column 0 the post URL, column 1 the scraped comment
    text, column 2 the comment text from the SO API (may be blank), column 3
    the timestamp ('%Y-%m-%d %H:%M:%S'), columns 4-6 the author's id, name
    and reputation, column 7 the group label and column 8 the upvotes.
    A new post starts whenever the URL differs from the previous row's URL,
    so the rows of one post must be contiguous.

    Returns:
        A list with one dict per post, in file order, mapping each Comment
        of that post to its group label. Empty if the file has no rows.

    Exits the process via sys.exit() when the csv module reports an error.
    """
    all_posts = []  # list of post that has comments and groups
    post_comments = {}  # all comments of the current post and their groups
    url = ''
    with open(enrichedFile, newline='', encoding="utf8", errors='ignore') as f:
        reader = csv.reader(f)
        try:
            for row in reader:
                if row[2].strip():
                    # Prefer the 3rd column because it is from SO API
                    text = row[2]
                else:
                    # Take the text only up to the en dash character;
                    # partition() keeps the whole text when there is no
                    # en dash instead of raising ValueError.
                    text = row[1].partition(u"\u2013")[0]
                    # Restore the original comma
                    # NOTE(review): the cleanup is applied to the fallback
                    # column only -- confirm against the intended nesting.
                    text = text.replace("|COMMA|", ",")
                timestamp = datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S')
                author = Author(row[5], row[4], row[6])
                comment = Comment(text, author, timestamp, row[8])

                post_url = row[0].strip()
                if post_url != url:
                    if post_comments:
                        # Hand the finished post over and start a fresh dict;
                        # nothing else references it, so no copy is needed.
                        all_posts.append(post_comments)
                        post_comments = {}
                    url = post_url
                post_comments[comment] = row[7]
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(enrichedFile, reader.line_num, e))
    # Add the last post, unless the file contained no rows at all
    if post_comments:
        all_posts.append(post_comments)
    return all_posts
def _getCommentWithText(self, comments, text):
    """Return the first comment whose words largely coincide with `text`.

    Both texts are reduced to their sets of whitespace-separated words. A
    comment matches when the union of the two sets is smaller than 1.25
    times the size of each set.

    Args:
        comments: iterable of objects exposing a `text` attribute.
        text: the text to look for.

    Returns:
        The first matching comment. When nothing matches, a placeholder
        Comment with empty text, a blank author and the current time
        (formatted as '%Y-%m-%d %H:%M:%S').
    """
    # The wanted words do not depend on the candidate, so build them once.
    wanted_words = set(text.split())
    wanted_count = len(wanted_words)
    for comment in comments:
        candidate_words = set(comment.text.split())
        union_count = len(wanted_words | candidate_words)
        # If most of the words are common, it's the same comment
        if (union_count < 1.25 * wanted_count
                and union_count < 1.25 * len(candidate_words)):
            return comment

    blankAuthor = Author('', 0, 0)
    return Comment('', blankAuthor,
                   datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
def testSimpleBert(self):
    # A comment compared with itself is expected to have a BERT cosine
    # similarity of exactly 1.0.
    # NOTE(review): this is an exact float comparison; confirm that the
    # feature returns precisely 1.0 for identical inputs.
    author = Author("VHS", 5749570, 8192)
    comment = Comment("This is a good answer", author,
                      datetime.datetime(2020, 5, 17))
    self_similarity = self.bertFeature.cosine_similarity(comment, comment)
    self.assertEqual(self_similarity, 1.0)