예제 #1
0
def create_users():
    """Create and persist a ``User`` for every top-level ``User:`` page.

    Queries all pages whose title matches ``'User:%'``, skips any whose
    title contains ``'/'``, builds a user from each remaining page with
    ``User.make_user_from_page`` and adds it to the session. The work is
    committed every 1000 users (the running count is printed at each of
    those checkpoints) and once more after the loop.

    The session is closed on every exit path, including when an exception
    propagates, so it does not stay open after a failure.
    """
    session = Session()
    try:
        pgs = session.query(Page).filter(Page.title.like('User:%'))
        counter = 0
        for page in pgs:
            # NOTE(review): presumably titles with '/' are user sub-pages
            # rather than accounts -- confirm.
            if '/' in page.title:
                continue
            user = User.make_user_from_page(page, session)
            session.add(user)
            counter += 1
            # Commit in batches of 1000 and report progress.
            if counter % 1000 == 0:
                print(counter)
                session.commit()
        # Flush whatever is left over from the last partial batch.
        session.commit()
    finally:
        # Assuming a SQLAlchemy session: close() releases the connection
        # and abandons any work that was not committed.
        session.close()
예제 #2
0
def create_users():
    """Create and persist a ``User`` for every top-level ``User:`` page.

    Queries all pages whose title matches ``'User:%'``, skips any whose
    title contains ``'/'``, builds a user from each remaining page with
    ``User.make_user_from_page`` and adds it to the session. The work is
    committed every 1000 users (the running count is printed at each of
    those checkpoints) and once more after the loop.

    The session is closed on every exit path, including when an exception
    propagates, so it does not stay open after a failure.
    """
    session = Session()
    try:
        pgs = session.query(Page).filter(Page.title.like('User:%'))
        counter = 0
        for page in pgs:
            # NOTE(review): presumably titles with '/' are user sub-pages
            # rather than accounts -- confirm.
            if '/' in page.title:
                continue
            user = User.make_user_from_page(page, session)
            session.add(user)
            counter += 1
            # Commit in batches of 1000 and report progress.
            if counter % 1000 == 0:
                print(counter)
                session.commit()
        # Flush whatever is left over from the last partial batch.
        session.commit()
    finally:
        # Assuming a SQLAlchemy session: close() releases the connection
        # and abandons any work that was not committed.
        session.close()
예제 #3
0
 def from_page(cls, page, session):
     """Extract signed comments from a talk page and add them to ``session``.

     The page text is split on the literal ``'(UTC)'``; every piece except
     the trailing remainder is treated as one candidate comment. For each
     piece a ``[[User:<name>|...]]`` link is searched for. The text before
     the link on that line becomes the comment body, and the last four
     whitespace-separated tokens after it are parsed as the timestamp. A
     ``Comment`` is added only when the timestamp matches one of the known
     formats.

     :param page: page object providing ``title``, ``text`` and ``language``.
     :param session: session used to create users and store comments; it
         is committed after every user lookup.
     :returns: ``[]`` when the page has no text, otherwise ``None``
         (comments are added to ``session``, not returned).
     :raises ValueError: if the page title does not start with ``'Talk:'``.
     """
     if not page.title.startswith('Talk:'):
         raise ValueError(u"Page {0} is not a talk page.".format(
             page.title))
     if not page.text:
         return []

     # Loop invariants: built once per call instead of once per comment.
     # Raw strings keep the regex escapes (\[, \|, \]) out of Python's own
     # string-escape processing; the patterns themselves are unchanged.
     pattern = re.compile(
         r'(?P<before>.*)\[\[User:(?P<username>.+?)\|.+\]\](?P<after>.*)'
     )
     # NOTE(review): the '|' below is unescaped, so this is an alternation
     # ("[[Special:Contributions..." OR "...]]"), not a literal pipe.
     # Kept as-is because it only decides whether a debug line is logged;
     # confirm whether '\|' was intended.
     special_pattern = re.compile(r'\[\[Special:Contributions.+|.+\]\]')
     # The last entry uses %S (seconds). It previously used '%s', which
     # strptime rejects as a bad directive, so that format never matched.
     possible_date_formats = ('%H:%M, %d %b %Y', '%H:%M, %d %B %Y',
                              '%d %b %Y %H:%M', '%d %B %Y %H:%M',
                              '%H:%M %b %d, %Y',
                              ') %Y-%m-%d %H:%M:%S.</small>')

     for bit in page.text.split('(UTC)')[:-1]:
         if not bit:
             continue
         # Get the username from the signature link.
         match = pattern.search(bit)
         if not match:
             # If it's a non-registered user don't give a warning.
             if not special_pattern.search(bit):
                 logger.debug(
                     u"Page {0}: Could not find user in '{1}'.".format(
                         page.title, bit))
             continue
         gd = match.groupdict()
         username = gd['username']
         before = gd['before']
         after = gd['after']
         user = User.make_user(username, page.language, session)
         # Make sure the user gets saved so it can be found.
         session.commit()
         # The timestamp is expected in the last four tokens before '(UTC)'.
         datestr = ' '.join(after.split()[-4:])
         date = None
         for df in possible_date_formats:
             try:
                 date = datetime.datetime.strptime(datestr, df)
                 break
             except ValueError:
                 # Not this format; try the next one.
                 pass
         if date is None:
             logger.debug(
                 u"Could not parse date string '{0}'.".format(datestr))
         else:
             session.add(Comment(user, page, date, before))
예제 #4
0
 def from_page(cls, page, session):
     """Extract signed comments from a talk page and add them to ``session``.

     The page text is split on the literal ``'(UTC)'``; every piece except
     the trailing remainder is treated as one candidate comment. For each
     piece a ``[[User:<name>|...]]`` link is searched for. The text before
     the link on that line becomes the comment body, and the last four
     whitespace-separated tokens after it are parsed as the timestamp. A
     ``Comment`` is added only when the timestamp matches one of the known
     formats.

     :param page: page object providing ``title``, ``text`` and ``language``.
     :param session: session used to create users and store comments; it
         is committed after every user lookup.
     :returns: ``[]`` when the page has no text, otherwise ``None``
         (comments are added to ``session``, not returned).
     :raises ValueError: if the page title does not start with ``'Talk:'``.
     """
     if not page.title.startswith('Talk:'):
         raise ValueError(u"Page {0} is not a talk page.".format(page.title))
     if not page.text:
         return []

     # Loop invariants: built once per call instead of once per comment.
     # Raw strings keep the regex escapes (\[, \|, \]) out of Python's own
     # string-escape processing; the patterns themselves are unchanged.
     pattern = re.compile(r'(?P<before>.*)\[\[User:(?P<username>.+?)\|.+\]\](?P<after>.*)')
     # NOTE(review): the '|' below is unescaped, so this is an alternation
     # ("[[Special:Contributions..." OR "...]]"), not a literal pipe.
     # Kept as-is because it only decides whether a debug line is logged;
     # confirm whether '\|' was intended.
     special_pattern = re.compile(r'\[\[Special:Contributions.+|.+\]\]')
     # The last entry uses %S (seconds). It previously used '%s', which
     # strptime rejects as a bad directive, so that format never matched.
     possible_date_formats = ('%H:%M, %d %b %Y',
                              '%H:%M, %d %B %Y',
                              '%d %b %Y %H:%M',
                              '%d %B %Y %H:%M',
                              '%H:%M %b %d, %Y',
                              ') %Y-%m-%d %H:%M:%S.</small>'
                              )

     for bit in page.text.split('(UTC)')[:-1]:
         if not bit:
             continue
         # Get the username from the signature link.
         match = pattern.search(bit)
         if not match:
             # If it's a non-registered user don't give a warning.
             if not special_pattern.search(bit):
                 logger.debug(u"Page {0}: Could not find user in '{1}'.".format(page.title, bit))
             continue
         gd = match.groupdict()
         username = gd['username']
         before = gd['before']
         after = gd['after']
         user = User.make_user(username, page.language, session)
         # Make sure the user gets saved so it can be found.
         session.commit()
         # The timestamp is expected in the last four tokens before '(UTC)'.
         datestr = ' '.join(after.split()[-4:])
         date = None
         for df in possible_date_formats:
             try:
                 date = datetime.datetime.strptime(datestr, df)
                 break
             except ValueError:
                 # Not this format; try the next one.
                 pass
         if date is None:
             logger.debug(u"Could not parse date string '{0}'.".format(datestr))
         else:
             session.add(Comment(user, page, date, before))