Exemplo n.º 1
0
def analyseArticle(file_name):

    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            #print "processing rev", revision.getId()

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(
                        revision.getText()) < CURR_LENGTH) and ((
                            (len(revision.getText()) - revision_prev.length) /
                            float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            #if (vandalism):
            #print "---------------------------- FLAG 1"
            #print "SPAM", revision.getId()
            #print revision.getText()
            #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor(
                    ).getId()
                    revision_curr.contributor_name = revision.getContributor(
                    ).getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername(
                    ).encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId(
                    )
                    revision_curr.contribur_name = 'Not Available ' + revision.getId(
                    )
                    relation.author = 'Not Available ' + revision.getId()

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr, relation)

                if (not vandalism):
                    #print "NOT SPAM", revision.getId()

                    # Add the current revision with all the information.
                    revisions.update(
                        {revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys(
                            ):
                                for sentence_curr in paragraph_curr.sentences[
                                        hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    #print "---------------------------- FLAG 2"
                    #print "SPAM", revision.getId()
                    #print revision.getText()
                    #print
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
Exemplo n.º 2
0
def analyseArticle(file_name):
    
    # Container of revisions.
    revisions = {}
    
    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)
    
    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0
        
        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False
            
            # Update the information about the previous revision.
            revision_prev = revision_curr
            
            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))
            
            if (revision.getSha1() in spam):
                vandalism = True
            
            #TODO: SPAM detection: DELETION
            if (revision.getComment()!= None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.getText()) < CURR_LENGTH) and (((len(revision.getText())-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev
            
            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print revision.getId()
                #print revision.getText()           
                #print
            
            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                
                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contribur_name = 'Not Available'
                
                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr 
                             
                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr)
                
            
                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    # Update the fake revision id.
                    i = i+1
                        
                else:
                    #print "---------------------------- FLAG 2"
                    #print revision.getId()
                    #print revision.getText()
                    #print
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())
           
    
    return revisions
Exemplo n.º 3
0
def getTagDatesFromPage(file_name):
    # Compile regexp
    reglist = list()
    reglist.append({
        "tagname":
        "maintained",
        "regexp":
        re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)maintained((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname":
        "good article",
        "regexp":
        re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)good article((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)featured article((\||=)(?:(?!\}\}).)*|)\}\}', re.IGNORECASE)})
    #reglist.append({"tagname": "featured article", "regexp": re.compile('\|currentstatus=FA', re.IGNORECASE)})
    reglist.append({
        "tagname":
        "npov",
        "regexp":
        re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)(pov|npov)((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    reglist.append({
        "tagname":
        "disputed",
        "regexp":
        re.compile(
            '\{\{(articleissues\|((?:(?!\}\}).)*\||)|multiple issues\|((?:(?!\}\}).)*\||)|)disputed((\||=)(?:(?!\}\}).)*|)\}\}',
            re.IGNORECASE)
    })
    re_user = re.compile('({{|\[\[)user.*?[:|](.*?)[}/\]|]', re.IGNORECASE)
    #"({{|\[\[)user[\s\S]*?[:|]([\s\S]*?)[}/\]|]"

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    listOfTagChanges = {}
    all_contributors = {
        "maintained": {},
        "good article": {},
        "featured article": {},
        "npov": {},
        "disputed": {}
    }

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        # Iterate over revisions of the article.
        i = 0
        prev_matched = list()
        for revision in page.readRevisions():
            revision.wikipedia_id = int(revision.getId())
            revision.timestamp = revision.getTimestamp()
            # Some revisions don't have contributor.
            if (revision.getContributor() != None):
                revision.contributor_id = revision.getContributor().getId()
                revision.contributor_name = revision.getContributor(
                ).getUsername()
            else:
                revision.contributor_id = 'Not Available'
                revision.contribur_name = 'Not Available'

            text_curr = revision.getText()
            if (text_curr):
                text_curr = text_curr.encode('utf-8')
                text_curr = text_curr.lower()
            else:
                continue
            matched = list()
            aux = list()

            for regexp in reglist:
                m = regexp["regexp"].search(text_curr)
                if m:
                    mc = re_user.split(m.group(0))

                    i = 2
                    users = []
                    users.append(mc[2])
                    while (i + 3 < len(mc)):
                        #print m[i+3]
                        users.append(mc[i + 3])
                        i = i + 3

                    #print regexp["tagname"], contributor
                    #print regexp["tagname"], users
                    matched.append(regexp["tagname"])
                    aux.append((regexp["tagname"], users))

                    ##
                    #m_user = re_user.search(m.group(0))
                    #contributor = m_user.group(2)
            if "|currentstatus=FA" in text_curr:
                matched.append("featured article")
                aux.append((revision.contributor_name, "featured article"))

            # Calculate additions
            for (match, contributor) in aux:
                if not (match in prev_matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev":
                        revision.wikipedia_id,
                        "type":
                        "addition",
                        "tagname":
                        match,
                        "wikiname":
                        revision.contributor_name,
                        "timestamp":
                        revision.timestamp,
                        "date":
                        datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })
                all_contributors[match].update({
                    revision.timestamp: {
                        "rev":
                        revision.wikipedia_id,
                        "user":
                        contributor,
                        "date":
                        datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    }
                })

            # Calculate removals
            for match in prev_matched:
                if not (match in matched):
                    if not (revision.timestamp in listOfTagChanges.keys()):
                        listOfTagChanges[revision.timestamp] = list()
                    listOfTagChanges[revision.timestamp].append({
                        "rev":
                        revision.wikipedia_id,
                        "type":
                        "removal",
                        "tagname":
                        match,
                        "wikiname":
                        revision.contributor_name,
                        "timestamp":
                        revision.timestamp,
                        "date":
                        datetime.datetime.fromtimestamp(int(
                            revision.timestamp)).strftime('%Y-%m-%d %H:%M:%S')
                    })

            prev_matched = matched

    return listOfTagChanges, all_contributors
Exemplo n.º 4
0
def analyseArticle(file_name):
    
    
    
    # Container of relationships.
    relations = {}
    
    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)
    
    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0
        
        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False
            
            #print "processing rev", revision.getId()
            
            # Update the information about the previous revision.
            revision_prev = revision_curr
            
            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))
            
            if (revision.getSha1() in spam):
                vandalism = True
            
            #TODO: SPAM detection: DELETION
            if (revision.getComment()!= None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.getText()) < CURR_LENGTH) and (((len(revision.getText())-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev
            
            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print "SPAM", revision.getId()
                #print revision.getText()           
                #print
            
            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()
                
                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())
                
                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contribur_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()
                
                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr 
                             
                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)
                
            
                if (not vandalism):
                    #print "NOT SPAM", revision.getId()
                    
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1
                    
                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                    
                        
                        
                else:
                    #print "---------------------------- FLAG 2"
                    #print "SPAM", revision.getId()
                    #print revision.getText()
                    #print
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())
                    
           
    
    return (revisions, revision_order, relations)
Exemplo n.º 5
0
def analyseArticle(file_name):

    # Container of revisions.
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(
                    Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None
                    and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(
                        revision.getText()) < CURR_LENGTH) and ((
                            (len(revision.getText()) - revision_prev.length) /
                            float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    print "VANDALISM: CHANGE PERCETANGE"
                    vandalism = True

            #if (vandalism):
            #print "---------------------------- FLAG 1"
            #print revision.getId()
            #print revision.getText()
            #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor(
                    ).getId()
                    revision_curr.contributor_name = revision.getContributor(
                    ).getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contribur_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev,
                                                text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update(
                        {revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1
                    # Update the index of processed revisions.
                    revision_order.append((revision_curr.wikipedia_id, False))

                else:
                    #print "detected vandalism in here ...................................."
                    #print "---------------------------- FLAG 2"
                    #print revision.getId()
                    #print revision.getText()
                    #print
                    spam.append(revision.getSha1())
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev

            else:
                #	revision.getText()
                #    #print
                spam.append(revision.getSha1())
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev

    return (revisions, revision_order)
Exemplo n.º 6
0
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.comment!= None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.text) < CURR_LENGTH) and (((len(revision.text)-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contribur_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)


                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
Exemplo n.º 7
0
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.comment!= None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.text) < CURR_LENGTH) and (((len(revision.text)-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contribur_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)


                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)