Пример #1
0
    def testTimeFeatures(self):
        """Check the time-difference flags reported by ``Time_Features``.

        A reference comment is compared with comments posted 40 seconds,
        4 days and about 3 months away from it; for each pair the flags
        listed below must have the stated values.
        """
        print("Start the time feature test")
        first_author = Author("VHS", 5749570, 8192)
        second_author = Author("VHS", 5749570, 8192)
        seconds_apart = Comment("This is a good answer", first_author,
                                datetime.datetime(2020, 5, 17, 23, 8, 15))
        reference = Comment("This is a bad answer", second_author,
                            datetime.datetime(2020, 5, 17, 23, 8, 55))

        extractor = Time_Features()
        features = extractor.getTimeFeature(seconds_apart, reference)
        self.assertEqual(features["tdiff_minute"], 1)

        # (timestamp of the other comment, expected flags in check order)
        later_cases = (
            (datetime.datetime(2020, 5, 21, 23, 8, 55),
             (("tdiff_minute", 0), ("tdiff_week", 1))),
            (datetime.datetime(2020, 2, 21, 23, 8, 55),
             (("tdiff_minute", 0), ("tdiff_week", 0),
              ("tdiff_half_year", 1))),
        )
        for posted_at, expected_flags in later_cases:
            other = Comment("This is an ok answer", second_author, posted_at)
            features = extractor.getTimeFeature(other, reference)
            for flag, expected in expected_flags:
                self.assertEqual(features[flag], expected)

        print("End of the time feature test")
    def testSameThirdSpeaker(self):
        """Check ``Speaker_Feature.refersToSameThirdSpeaker``.

        Two comments that both contain "@VHS" must be detected. A comment
        that says "VHS" without the "@" and one that mentions
        "@ViralSheth" must not be paired with the first comment.
        """
        print("Start the refers to the same third speaker test")
        holger = Author("Holger", 5749570, 18192)
        chux = Author("Chux", 2410359, 99547)
        first_mention = Comment("This is a good answer, @VHS", holger,
                                datetime.datetime(2020, 5, 17))
        second_mention = Comment("Thanks @VHS. I appreciate it", chux,
                                 datetime.datetime(2010, 5, 17))

        detector = Speaker_Feature()
        detected = detector.refersToSameThirdSpeaker(first_mention,
                                                     second_mention)
        self.assertTrue(detected,
                        msg="Refers to same third speaker not detected")

        skeet = Author("Jon Skeet", 22656, 1186748)
        non_matching_texts = (
            "+1 VHS for your answer",
            "Thank you @ViralSheth. Glad to be of help",
        )
        for body in non_matching_texts:
            candidate = Comment(body, skeet, datetime.datetime(2015, 5, 17))
            detected = detector.refersToSameThirdSpeaker(
                candidate, first_mention)
            self.assertFalse(
                detected,
                msg="Refers to same third person shouldn't have been detected")

        print("End of the refers to the same third speaker test")
Пример #3
0
 def testUndefinedSemantics(self):
     """The semantic cosine similarity of these two comments must not be 0.0.

     Per the original author's note, the second comment doesn't match any
     words in the vocabulary.
     """
     susan = Author("SusanW", 5851520, 1461)
     question = Comment(
         "@AmrishPandey &quot;finally block is not called in case of exception thrown by daemon thread&quot; - really?? [Citation Needed], I think? Actually <code>thread.stop()</code> does not necessarily prevent <code>finally</code> block from being executed.",
         susan, datetime.datetime(2017, 4, 25))
     anonymous = Author("", 0, 0)
     # doesn't match any words in vocab
     link_reply = Comment("@SusanW javarevisited.blogspot.in/2012/03/…",
                          anonymous, datetime.datetime(2020, 6, 24))
     score = self.semFeature.cosine_similarity(question, link_reply)
     self.assertNotEqual(score, 0.0)
 def testRelated(self):
     """Two comments about Java lambdas must have a non-zero BERT similarity."""
     susan = Author("SusanW", 5851520, 1461)
     first = Comment("Lambda is a good feature of Java 8", susan,
                     datetime.datetime(2017, 4, 25))
     anonymous = Author("", 0, 0)
     second = Comment("Java needed lambdas for a long time", anonymous,
                      datetime.datetime(2020, 6, 24))
     score = self.bertFeature.cosine_similarity(first, second)
     print("Similarity on related comments is ", score)
     self.assertNotEqual(score, 0.0)
Пример #5
0
 def testWeightedIsDifferent(self):
     """Weighted and plain cosine similarity must differ for the same pair.

     Each similarity is computed once, printed for inspection, and then
     compared with ``assertNotEqual`` so that a failure reports both values.
     """
     a1 = Author("", 0, 0)
     c1 = Comment("This is a good java answer", a1,
                  datetime.datetime(2020, 1, 1))
     a2 = Author("", 0, 0)
     c2 = Comment("I like java", a2, datetime.datetime(2020, 1, 1))

     weighted = self.semFeature.weighted_cosine_similarity(c1, c2)
     print(weighted)
     unweighted = self.semFeature.cosine_similarity(c1, c2)
     print(unweighted)

     self.assertNotEqual(weighted, unweighted)
Пример #6
0
    def testJaccardFeatures(self):
        """``jaccard_code_feature`` must score the two comments below as 1.0."""
        first_author = Author("VHS", 5749570, 8192)
        second_author = Author("VHS", 5749570, 8192)
        left = Comment("Worth noting PrintWriter", first_author,
                       datetime.datetime(2020, 5, 17, 23, 8, 15))
        right = Comment("PrintWriter is worth", second_author,
                        datetime.datetime(2020, 5, 17, 23, 8, 55))

        similarity = Text_Similarity_Features()
        score = similarity.jaccard_code_feature(left, right)

        self.assertEqual(score, 1.0)
Пример #7
0
 def testComparativeSemantics_Weighted(self):
     """"I like junit" must be closer to "I like java" than "I like python" is.

     The weighted similarity of the junit/java pair is computed once and
     must exceed both the java/python and the junit/python similarities.
     ``assertGreater`` is used so a failure reports the compared values.
     """
     a1 = Author("", 0, 0)
     c1 = Comment("I like junit", a1, datetime.datetime(2020, 1, 1))
     a2 = Author("", 0, 0)
     c2 = Comment("I like java", a2, datetime.datetime(2020, 1, 1))
     a3 = Author("", 0, 0)
     c3 = Comment("I like python", a3, datetime.datetime(2020, 1, 1))

     junit_vs_java = self.semFeature.weighted_cosine_similarity(c1, c2)
     self.assertGreater(
         junit_vs_java,
         self.semFeature.weighted_cosine_similarity(c2, c3))
     self.assertGreater(
         junit_vs_java,
         self.semFeature.weighted_cosine_similarity(c1, c3))
Пример #8
0
    def testJaccardFeatures(self):
        """``jaccard_feature`` must score the two comments below as 2/3.

        The expected value is a non-terminating binary fraction, so the
        comparison uses ``assertAlmostEqual`` instead of exact float
        equality.
        """
        print("Start the Jaccard feature test")
        a1 = Author("VHS", 5749570, 8192)
        a2 = Author("VHS", 5749570, 8192)
        c1 = Comment("This is a good answer", a1,
                     datetime.datetime(2020, 5, 17, 23, 8, 15))
        c2 = Comment("This is a bad answer", a2,
                     datetime.datetime(2020, 5, 17, 23, 8, 55))

        textSimFeatures = Text_Similarity_Features()
        jaccard_score = textSimFeatures.jaccard_feature(c1, c2)

        self.assertAlmostEqual(jaccard_score, 2 / 3)
Пример #9
0
    def retrieveComments(self, ansId, timeout=30):
        """Fetch the comments of a Stack Overflow answer from the SE API.

        Only the first page is requested (``page=1``, ``pagesize=100``), so
        at most 100 comments are returned, in ascending creation order.

        Args:
            ansId: Id of the answer; converted with ``str`` and inserted
                into the request URL.
            timeout: Seconds to wait for the API before ``requests`` raises
                a timeout error. Without a timeout the call could block
                forever on an unresponsive connection.

        Returns:
            A list of ``Comment`` objects. Each one is built from the body
            text, an ``Author``, the creation time formatted as a
            ``'%Y-%m-%d %H:%M:%S'`` string (local time, as produced by
            ``datetime.fromtimestamp``), and the comment score.

        Raises:
            KeyError: If the response JSON has no ``'items'`` entry, or an
                item lacks one of the fields read below.
        """
        # NOTE(review): the API key and filter are hard-coded in the URL;
        # consider moving them to configuration.
        webapi = 'https://api.stackexchange.com/2.2/answers/' + str(
            ansId
        ) + '/comments?page=1&pagesize=100&order=asc&sort=creation&site=stackoverflow&key=aT0javmxqIqcfwsWoTDA4w((&filter=!)srVqBhiaSgtcRgeozZw'
        json_data = requests.get(webapi, timeout=timeout).json()

        comments = []
        for item in json_data['items']:
            owner = item['owner']
            uname = owner['display_name']
            if 'user_id' in owner:  # the user is not deleted from SO
                uid = owner['user_id']
                reputation = owner['reputation']
            else:
                uid = None
                reputation = 0
            author = Author(uname, uid, reputation)

            # The API reports creation_date as epoch seconds.
            timestamp = datetime.datetime.fromtimestamp(
                item['creation_date']).strftime('%Y-%m-%d %H:%M:%S')
            comments.append(
                Comment(item['body'], author, timestamp, item['score']))
        return comments
    def testSameSpeaker(self):
        """Check ``Speaker_Feature.isSameSpeaker``.

        Comments by two ``Author`` objects built with identical arguments
        must be reported as the same speaker; a comment by a differently
        named author must not.
        """
        print("Start the same speaker test")
        original_author = Author("VHS", 5749570, 8192)
        duplicate_author = Author("VHS", 5749570, 8192)
        recent = Comment("This is a good answer", original_author,
                         datetime.datetime(2020, 5, 17))
        older = Comment("This is a bad answer", duplicate_author,
                        datetime.datetime(2010, 5, 17))

        detector = Speaker_Feature()
        same = detector.isSameSpeaker(recent, older)
        self.assertTrue(same, msg="Same author is not detected")

        other_author = Author("Jon Skeet", 22656, 1186748)
        unrelated = Comment("+1 for the answer", other_author,
                            datetime.datetime(2015, 5, 17))
        same = detector.isSameSpeaker(recent, unrelated)
        self.assertFalse(same, msg="Different author is not detected")

        print("End of the same speaker test")
Пример #11
0
    def loadData(self):
        """Load the enriched CSV file and group its comments by post.

        Rows are read from the module-level ``enrichedFile``. Consecutive
        rows with the same URL in column 0 belong to one post.

        Columns read: 0 URL, 1 scraped comment text, 2 comment text from
        the SO API (preferred when non-blank), 3 timestamp, 4 user id,
        5 user name, 6 reputation, 7 group label, 8 upvotes. All values
        other than the timestamp are passed on as the strings read from
        the file.

        Returns:
            A list with one dict per post that has comments; each dict maps
            a ``Comment`` to its group label.

        Raises:
            SystemExit: If the ``csv`` module reports an error while
                reading the file.
            ValueError: If column 3 is not a ``%Y-%m-%d %H:%M:%S``
                timestamp.
            IndexError: If a row has fewer than nine columns.
        """
        all_posts = []  # one dict per post that has comments
        post_comments = {}  # comments of the current post -> their group
        url = ''
        with open(enrichedFile, newline='', encoding="utf8",
                  errors='ignore') as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    # Prefer the 3rd column when present: it comes from
                    # the SO API.
                    if row[2].strip():
                        text = row[2]
                    else:
                        # Keep the text only up to the en dash character.
                        # partition() leaves the text intact when there is
                        # no en dash, where index() raised ValueError.
                        text = row[1].partition(u"\u2013")[0]
                    # Restore the original comma
                    text = text.replace("|COMMA|", ",")

                    timestamp = datetime.datetime.strptime(
                        row[3], '%Y-%m-%d %H:%M:%S')

                    # Author info
                    author = Author(row[5], row[4], row[6])
                    comment = Comment(text, author, timestamp, row[8])
                    group = row[7]

                    row_url = row[0].strip()
                    if row_url != url:
                        # A new post starts: store the finished one and
                        # continue with a fresh dict (no copy needed, as
                        # nothing else references the old dict).
                        if post_comments:
                            all_posts.append(post_comments)
                        post_comments = {}
                        url = row_url

                    post_comments[comment] = group

            except csv.Error as e:
                sys.exit('file {}, line {}: {}'.format(
                    enrichedFile, reader.line_num, e))

        # Add the last post, unless the file contained no rows at all.
        if post_comments:
            all_posts.append(post_comments)

        return all_posts
Пример #12
0
    def _getCommentWithText(self, comments, text):
        """Return the first comment whose words nearly coincide with ``text``.

        Both texts are split on whitespace into word sets. A comment
        matches when the union of the two sets is smaller than 1.25 times
        the size of each individual set, i.e. most of the words are common.

        Args:
            comments: Iterable of objects with a ``text`` string attribute.
            text: The text to look for.

        Returns:
            The first matching comment, or a placeholder ``Comment`` with
            empty text, a blank ``Author`` and the current time (as a
            ``'%Y-%m-%d %H:%M:%S'`` string) when nothing matches.
        """
        blankAuthor = Author('', 0, 0)
        placeholder = Comment(
            '', blankAuthor,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        # The word set of the searched text does not depend on the
        # candidate, so build it once instead of once per comment.
        wanted = set(text.strip().split())
        wanted_limit = 1.25 * len(wanted)

        for comment in comments:
            candidate = set(comment.text.strip().split())
            union_size = len(wanted | candidate)
            # If most of the words are common, it's the same comment
            if union_size < wanted_limit and union_size < 1.25 * len(candidate):
                return comment
        return placeholder
 def testSimpleBert(self):
     """A comment's BERT cosine similarity with itself must be 1.

     The similarity is a floating-point result, so it is compared with
     ``assertAlmostEqual`` instead of exact equality to tolerate rounding
     error.
     """
     a1 = Author("VHS", 5749570, 8192)
     c1 = Comment("This is a good answer", a1,
                  datetime.datetime(2020, 5, 17))
     self.assertAlmostEqual(self.bertFeature.cosine_similarity(c1, c1), 1.0)