def get_medium(self, text):
    """Return the Medium object for *text*, resolving aliases first.

    A falsy *text* (``None`` or an empty string) is replaced by the name
    "unknown". If the name has an entry in MEDIUM_ALIASES, the aliased name
    is used instead. The resulting name is handed to Medium.get_or_create.
    """
    name = text or "unknown"
    canonical = MEDIUM_ALIASES.get(name, name)
    return Medium.get_or_create(canonical)
def create_medium(self, medium):
    """Return the Medium object for the name *medium*, resolving aliases.

    A falsy *medium* (``None`` or an empty string) is replaced by the name
    "unknown". If the name has an entry in MEDIUM_ALIASES, the aliased name
    is used instead. The resulting name is handed to Medium.get_or_create,
    whose result is returned.
    """
    # ``not medium`` already covers the empty case; the former extra
    # ``len(medium) < 1`` test could only add a TypeError for truthy values
    # that have no length.
    if not medium:
        medium = "unknown"
    # Single lookup that falls back to the name itself when it is no alias.
    return Medium.get_or_create(MEDIUM_ALIASES.get(medium, medium))
def get_medium(text):
    """Return the canonical medium name for *text*.

    A falsy *text* is treated as "unknown". Names that have an entry in
    MEDIUM_ALIASES are translated to the aliased name; every other name is
    returned unchanged.
    """
    name = text if text else "unknown"
    if name not in MEDIUM_ALIASES:
        return name
    return MEDIUM_ALIASES[name]
def create_medium(self, html):
    """Return the Medium object named by the text of *html*.

    The medium name is ``html.text``, or "unknown" when that text is falsy.
    Names that have an entry in MEDIUM_ALIASES are replaced by the aliased
    name before the name is handed to Medium.get_or_create.
    """
    medium = html.text or "unknown"
    if medium in MEDIUM_ALIASES:
        medium = MEDIUM_ALIASES[medium]
    return Medium.get_or_create(medium)
def _scrape_unit(self, _file):
    """Parse one e-mail file and yield the article it contains.

    The file is read with ``readlines()``. The first line starting with
    "Date:" supplies the date the mail was sent. Every line up to and
    including the first one that starts with "1red" is kept as mail header
    (stored in the article's metastring); the lines after it are parsed as

    * a metadata line of the form ``<date>, <section>, <medium>``,
    * an all-uppercase headline, and
    * body paragraphs separated by empty lines.

    Raises IndexError when there is no "Date:" line, when the metadata line
    has fewer than three ", "-separated fields, or when the content runs out
    before a headline or any body line has been seen.
    """
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = read_date(file_date_line.split("Date:")[1])

    # Split the mail: header lines are collected until the "1red" marker has
    # been seen; the empty string appended at the marker makes ``lines``
    # truthy so that all following lines are treated as content.
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():
            article.title = line
            break
        elif line:  # non-empty line before the headline: contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Two-digit year written as '14 -> expand to 2014.
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]

            year_guessed = "=" in datestr
            if year_guessed:
                # The year is not parsable; take the year the mail was sent.
                datestr = datestr.split("=")[0] + str(file_date.year)
            date = read_date(datestr)
            if year_guessed and (date - file_date).days > 200:
                # Likely a misparse, with the mail being sent the next year.
                # timedelta has no notion of years, so replace the year.
                try:
                    date = date.replace(year=date.year - 1)
                except ValueError:
                    # 29 February does not exist in the previous year.
                    date = date.replace(year=date.year - 1, day=28)
            article.date = date

            # data[2] is the medium as written in the mail; translate it
            # when it is a known alias (data[1] is the section, not a key).
            medium_str = BZK_ALIASES.get(data[2], data[2])
            article.set_property("medium", medium_str)
            article.set_property("section", data[1])

    # Collect paragraphs; a paragraph is closed by an empty line. A trailing
    # "=" is stripped from each line before it is joined to the paragraph.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    kept = []
    for p in paragraphs:
        kept.append(p + "\n\n")
        if p.startswith("(") and "," in p:  # last line of the regular content
            break
    text = "".join(kept)

    # Add non-ascii characters:
    # takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        if code == "92":
            return "'"
        if code == "85":
            return "..."
        try:
            return bytearray.fromhex(code).decode('latin-1')
        except ValueError:
            # Not a hexadecimal escape (e.g. "=XY"); leave the text as it is.
            return match.group()

    article.text = re.sub(r"=[A-Z0-9]{2}", character, text)
    yield article
def get_medium(self, medium):
    """Return the canonical medium name for *medium*.

    A falsy *medium* (``None`` or an empty string) is treated as "unknown".
    Names that have an entry in MEDIUM_ALIASES are translated to the aliased
    name; every other name is returned unchanged.
    """
    # ``not medium`` already covers the empty case; the former extra
    # ``len(medium) < 1`` test could only add a TypeError for truthy values
    # that have no length.
    if not medium:
        medium = "unknown"
    return MEDIUM_ALIASES.get(medium, medium)
# Maintenance script: for every BZK alias, move the articles of project 29
# from the alias medium to the medium it is an alias of, and delete the alias
# medium once no article refers to it any more.
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

for alias_name, medium_name in aliases.items():
    if alias_name == medium_name:
        print('alias is no alias')
        continue

    print(alias_name, " > ", medium_name)

    # change all articles in project 29
    alias_medium = Medium.get_or_create(alias_name)
    project_articles = Article.objects.filter(medium=alias_medium.id, project_id=29)
    print("{} articles".format(project_articles.count()))
    target_medium = Medium.get_or_create(medium_name)
    project_articles.update(medium=target_medium.id)

    # if no article (in any project) uses the alias medium now, delete it
    remaining = Article.objects.filter(medium=alias_medium.id).count()
    if remaining == 0:
        print('deleting...')
        alias_medium.delete()
# Inverting the BZK aliases dict: every alias listed under an entry of
# BZK_ALIASES becomes a key that maps back to that entry's key.
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES

if __name__ == '__main__':
    new_dict = {
        alias: medium
        for medium, alias_list in BZK_ALIASES.items()
        for alias in alias_list
    }
    print(new_dict)

# WvA: Why is this here? Isn't this the same as the script in maintenance/tmp?
# If so, please hg rm it!
def _scrape_unit(self, _file):
    """Parse one e-mail file and yield the article it contains.

    The file is read with ``readlines()``. The first line starting with
    "Date:" supplies the date the mail was sent. Every line up to and
    including the first one that starts with "1red" is kept as mail header
    (stored in the article's metastring); the lines after it are parsed as

    * a metadata line of the form ``<date>, <section>, <medium>``,
    * an all-uppercase headline, and
    * body paragraphs separated by empty lines.

    Raises IndexError when there is no "Date:" line, when the metadata line
    has fewer than three ", "-separated fields, or when the content runs out
    before a headline or any body line has been seen.
    """
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])

    # Split the mail: header lines are collected until the "1red" marker has
    # been seen; the empty string appended at the marker makes ``lines``
    # truthy so that all following lines are treated as content.
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.headline = line
            break
        elif line:  # non-empty line before the headline: contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Two-digit year written as '14 -> expand to 2014.
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]

            year_guessed = "=" in datestr
            if year_guessed:
                # The year is not parsable; take the year the mail was sent.
                datestr = datestr.split("=")[0] + str(file_date.year)
            date = readDate(datestr)
            if year_guessed and (date - file_date).days > 200:
                # Likely a misparse, with the mail being sent the next year.
                # timedelta has no notion of years, so replace the year.
                try:
                    date = date.replace(year=date.year - 1)
                except ValueError:
                    # 29 February does not exist in the previous year.
                    date = date.replace(year=date.year - 1, day=28)
            article.date = date

            # data[2] is the medium as written in the mail; translate it
            # when it is a known alias (data[1] is the section, not a key).
            medium_str = BZK_ALIASES.get(data[2], data[2])
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]

    # Collect paragraphs; a paragraph is closed by an empty line. A trailing
    # "=" is stripped from each line before it is joined to the paragraph.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    kept = []
    for p in paragraphs:
        kept.append(p + "\n\n")
        if p.startswith("(") and "," in p:  # last line of the regular content
            break
    text = "".join(kept)

    # Add non-ascii characters:
    # takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        if code == "92":
            return "'"
        if code == "85":
            return "..."
        try:
            # bytearray.fromhex replaces the 'string-escape' codec, which
            # does not exist on Python 3.
            return bytearray.fromhex(code).decode('latin-1')
        except ValueError:
            # Not a hexadecimal escape (e.g. "=XY"); leave the text as it is.
            return match.group()

    article.text = re.sub(r"=[A-Z0-9]{2}", character, text)
    yield article
# Maintenance script: walks over the BZK alias table and, for each alias that
# differs from its medium, re-points the project 29 articles to that medium.
# An alias medium that ends up without any article is deleted.
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

for old_name, new_name in aliases.items():
    if old_name != new_name:
        print(old_name, " > ", new_name)

        # change all articles in project 29
        old_medium = Medium.get_or_create(old_name)
        to_move = Article.objects.filter(medium=old_medium.id, project_id=29)
        print("{} articles".format(to_move.count()))
        new_medium_id = Medium.get_or_create(new_name).id
        to_move.update(medium=new_medium_id)

        # if the alias medium is now empty, delete it
        still_used = Article.objects.filter(medium=old_medium.id).count()
        if still_used == 0:
            print('deleting...')
            old_medium.delete()
    else:
        print('alias is no alias')