def create_users():
    """Create a User for each ``User:`` page and store it through a session.

    Pages whose title contains a ``/`` are ignored.  Progress is printed and
    the pending work committed after every 1000 users; whatever is left is
    committed once all pages have been processed.
    """
    session = Session()
    user_pages = session.query(Page).filter(Page.title.like('User:%'))
    created = 0
    for user_page in user_pages:
        if '/' not in user_page.title:
            new_user = User.make_user_from_page(user_page, session)
            session.add(new_user)
            created += 1
            # Flush in batches so one huge transaction is not built up.
            if not created % 1000:
                print(created)
                session.commit()
    # Persist the final, partially filled batch.
    session.commit()
def from_page(cls, page, session):
    """Extract the signed comments of a talk page and add them to a session.

    The page text is split on the ``(UTC)`` marker that ends a signature.
    In each piece the author is taken from a ``[[User:name|...]]`` link, the
    text preceding that link on the same line becomes the comment body, and
    the timestamp is parsed from the last four whitespace-separated tokens
    following the link.  Pieces without a recognisable user link or
    timestamp are skipped and reported at debug level.

    Args:
        cls: Unused.
        page: Object exposing ``title``, ``text`` and ``language``.
        session: Object exposing ``add()`` and ``commit()`` (presumably a
            database session -- confirm against the caller).

    Returns:
        list: The ``Comment`` objects that were added to ``session``; empty
        when the page has no text or no parsable signature.

    Raises:
        ValueError: If the page title does not start with ``Talk:``.
    """
    if not page.title.startswith('Talk:'):
        raise ValueError(u"Page {0} is not a talk page.".format(page.title))

    comments = []
    if not page.text:
        return comments

    # Built once per call instead of once per signature.  Raw strings keep
    # the regex escapes out of Python's own string-escape processing.
    user_pattern = re.compile(
        r'(?P<before>.*)\[\[User:(?P<username>.+?)\|.+\]\](?P<after>.*)')
    # NOTE(review): the unescaped ``|`` makes this an alternation between
    # "[[Special:Contributions..." and "...]]"; an escaped pipe may have
    # been intended.  Left unchanged: it only decides whether a debug
    # message is logged.
    special_pattern = re.compile(r'\[\[Special:Contributions.+|.+\]\]')
    # The last entry used ``%s``, which strptime rejects with ValueError, so
    # that format could never match; ``%S`` (seconds) is the valid directive.
    date_formats = (
        '%H:%M, %d %b %Y',
        '%H:%M, %d %B %Y',
        '%d %b %Y %H:%M',
        '%d %B %Y %H:%M',
        '%H:%M %b %d, %Y',
        ') %Y-%m-%d %H:%M:%S.</small>',
    )

    # The piece after the last '(UTC)' is not terminated by a signature.
    for bit in page.text.split('(UTC)')[:-1]:
        if not bit:
            continue

        match = user_pattern.search(bit)
        if match is None:
            # If it's a non-registered user don't give a warning.
            if special_pattern.search(bit) is None:
                logger.debug(
                    u"Page {0}: Could not find user in '{1}'.".format(
                        page.title, bit))
            continue

        username = match.group('username')
        before = match.group('before')
        after = match.group('after')

        user = User.make_user(username, page.language, session)
        # Make sure the user gets saved so it can be found.
        session.commit()

        datestr = ' '.join(after.split()[-4:])
        date = None
        for date_format in date_formats:
            try:
                date = datetime.datetime.strptime(datestr, date_format)
            except ValueError:
                continue
            break
        if date is None:
            logger.debug(
                u"Could not parse date string '{0}'.".format(datestr))
            continue

        comment = Comment(user, page, date, before)
        session.add(comment)
        comments.append(comment)

    return comments
def from_page(cls, page, session):
    """Extract the signed comments of a talk page and add them to a session.

    The page text is split on the ``(UTC)`` marker that ends a signature.
    In each piece the author is taken from a ``[[User:name|...]]`` link, the
    text preceding that link on the same line becomes the comment body, and
    the timestamp is parsed from the last four whitespace-separated tokens
    following the link.  Pieces without a recognisable user link or
    timestamp are skipped and reported at debug level.

    Args:
        cls: Unused.
        page: Object exposing ``title``, ``text`` and ``language``.
        session: Object exposing ``add()`` and ``commit()`` (presumably a
            database session -- confirm against the caller).

    Returns:
        list: The ``Comment`` objects that were added to ``session``; empty
        when the page has no text or no parsable signature.

    Raises:
        ValueError: If the page title does not start with ``Talk:``.
    """
    if not page.title.startswith('Talk:'):
        raise ValueError(u"Page {0} is not a talk page.".format(page.title))

    comments = []
    if not page.text:
        return comments

    # Built once per call instead of once per signature.  Raw strings keep
    # the regex escapes out of Python's own string-escape processing.
    user_pattern = re.compile(
        r'(?P<before>.*)\[\[User:(?P<username>.+?)\|.+\]\](?P<after>.*)')
    # NOTE(review): the unescaped ``|`` makes this an alternation between
    # "[[Special:Contributions..." and "...]]"; an escaped pipe may have
    # been intended.  Left unchanged: it only decides whether a debug
    # message is logged.
    special_pattern = re.compile(r'\[\[Special:Contributions.+|.+\]\]')
    # The last entry used ``%s``, which strptime rejects with ValueError, so
    # that format could never match; ``%S`` (seconds) is the valid directive.
    date_formats = (
        '%H:%M, %d %b %Y',
        '%H:%M, %d %B %Y',
        '%d %b %Y %H:%M',
        '%d %B %Y %H:%M',
        '%H:%M %b %d, %Y',
        ') %Y-%m-%d %H:%M:%S.</small>',
    )

    # The piece after the last '(UTC)' is not terminated by a signature.
    for bit in page.text.split('(UTC)')[:-1]:
        if not bit:
            continue

        match = user_pattern.search(bit)
        if match is None:
            # If it's a non-registered user don't give a warning.
            if special_pattern.search(bit) is None:
                logger.debug(
                    u"Page {0}: Could not find user in '{1}'.".format(
                        page.title, bit))
            continue

        username = match.group('username')
        before = match.group('before')
        after = match.group('after')

        user = User.make_user(username, page.language, session)
        # Make sure the user gets saved so it can be found.
        session.commit()

        datestr = ' '.join(after.split()[-4:])
        date = None
        for date_format in date_formats:
            try:
                date = datetime.datetime.strptime(datestr, date_format)
            except ValueError:
                continue
            break
        if date is None:
            logger.debug(
                u"Could not parse date string '{0}'.".format(datestr))
            continue

        comment = Comment(user, page, date, before)
        session.add(comment)
        comments.append(comment)

    return comments