def QandA_formatter(iterable):
    """Yield formatted ``(question_text, answer_text)`` pairs.

    ``iterable`` supplies ``(question, answer)`` pairs. Each question body
    is passed through ``formats.link.remove``, ``flatten_paragraph`` and
    ``shorten_question``:

    * if the result is ``None`` the pair is skipped, and so is every
      directly following pair whose question body is empty;
    * if the result is empty although the body was not, the pair is
      skipped.

    For every remaining pair the question text is wrapped by
    ``formats.link.apply`` (pointing at the question's permalink with
    ``?context=5`` appended) and the answer body is passed through
    ``formats.link.shorten``; ``tallies(...)`` of the respective object is
    appended to each text.
    """
    skipping_followups = False
    for question_, answer_ in iterable:
        body = question_.body

        # Empty-bodied entries right after a dismissed question share its fate.
        if not body and skipping_followups:
            continue
        skipping_followups = False

        stripped = formats.link.remove(body)
        question = shorten_question(flatten_paragraph(stripped))

        if question is None:
            # Dismissed: remember it so empty-bodied followers are dropped too.
            skipping_followups = True
            continue
        if not question and body:
            # There was a body, but nothing is left of it after shortening.
            continue

        target = question_.permalink + '?context=5'
        question_text = formats.link.apply(question, target) + tallies(question_)
        answer_text = formats.link.shorten(answer_.body) + tallies(answer_)
        yield (question_text, answer_text)
def QandA_splitquestions(iterable):
    """Split each (question, answer) pair into smaller matched pairs.

    For every pair taken from ``iterable`` the question body and the answer
    body are cut into blocks (``readers.blocks_nosep`` followed by
    ``remove_thank_blocks``), and the blocks are paired up using, in order:

    1. answer paragraphs that quote the question (blockquote attempts);
    2. plain positional pairing when the block counts already agree, or
       when either side consists of a single block;
    3. re-splitting the question blocks with ``questions(...)`` when there
       are fewer question blocks than answer blocks;
    4. grouping by shared words when the counts still differ.

    Yields ``(question_, answer_)`` tuples of shallow ``copy()`` objects
    whose ``body`` holds one piece each (the answer piece is passed through
    ``reformat``). When one side runs out first, the missing bodies are
    filled with ``''``. Pairs that end up with no question pieces or no
    answer pieces are skipped entirely.
    """
    for question, answer in iterable:
        # Question blocks: links removed, reformatted and flattened.
        q = list(
            flatten_paragraph(reformat(
                formats.link.remove(p)
            ))
            for p in remove_thank_blocks(readers.blocks_nosep(question.body))
        )
        # Answer blocks: links shortened before the body is cut into blocks.
        a = list(remove_thank_blocks(readers.blocks_nosep(
            formats.link.shorten(answer.body)
        )))

        # first, try to match blockquotes against questions
        # and treat fully strong/emphasised paragraphs as such (courtesy of mojangles)
        quote = False       # True once at least one quoted question was matched
        last_quote = False  # True while the previous paragraph was a matched quote
        q_ = []             # groups of consecutive quoted question phrases
        a_ = []             # groups of answer paragraphs following each quote group
        for phrase, rphrase in ((phrase_.strip(), phrase_) for phrase_ in a):
            if is_blockquote_attempt(phrase, rphrase):
                # Drop the quote/emphasis markup surrounding the phrase.
                phrase = phrase.lstrip('*> "').rstrip(' *"')
                # Only counts as a quote if it is one of the question blocks
                # or occurs as a substring of the raw question body.
                if phrase in q or phrase in question.body:
                    if last_quote:
                        q_[-1].append(phrase)
                    else:
                        q_.append([phrase])
                    last_quote = quote = True
            elif quote:
                # Ordinary paragraph after a quote was seen: the first one
                # opens a new answer group, later ones extend that group.
                if last_quote:
                    a_.append([phrase])
                else:
                    a_[-1].append(phrase)
                last_quote = False

        if quote:
            if not (q_ and a_):
                continue
            try:
                # Join each quote group into one question and pad it with ''
                # so it spans as many slots as its answer group has
                # paragraphs; then flatten both sides into parallel lists.
                q, a = [list(chain(*l)) for l in zip(
                    *[
                        ([' '.join(ques)] + [''] * (len(ans) - 1), ans)
                        for ques, ans in izip(q_, a_)
                    ]
                )
                ]
            except ValueError:
                # Unpacking failed; dump the groups for debugging, then re-raise.
                print(q_, a_)
                raise
            diff = 0
        else:
            diff = len(q) - len(a)
            if len(q) == 1 or len(a) == 1:
                # A single block on either side: treat all question blocks
                # as one question.
                q = [' '.join(q)]
                diff = 0

        if diff < 0:
            # Fewer question blocks than answer blocks: re-split the question
            # blocks with questions().
            # NOTE(review): presumably questions() yields the individual
            # questions found in a paragraph - confirm against its definition.
            q_ = list(chain.from_iterable(questions(q__) for q__ in q))
            if not q_:
                # Nothing found: collapse the answer into a single piece.
                a = [' '.join(a)]
                diff = 0
            else:
                diff_ = len(q_) - len(a)
                if not diff_:
                    # The finer split lines up one-to-one with the answers.
                    q = q_
                    diff = 0

        if diff:
            # Counts still differ: group blocks by the words they share.
            # NOTE(review): assumes words() returns set-like objects (they
            # are combined with & and measured with len) - confirm.
            words_q = list(words(phrase) for phrase in q)
            words_a = list(words(phrase) for phrase in a)
            scores = {}
            total = 0
            amount = 1  # starts at 1, so the divisor below is never zero
            for i, wordset in enumerate(words_q):
                for j, wordset_ in enumerate(words_a):
                    inter = wordset & wordset_
                    score = len(inter)
                    scores[(i,j)] = score
                    if score:
                        total += score
                        amount += 1
            avg = total / float(amount)

            q_ = []
            a_ = []
            positions = {}         # question index -> group index
            answer_positions = {}  # answer index -> group index
            used_q = []
            used_a = []
            # Walk the (question, answer) index pairs in sorted order and keep
            # only those scoring above the average.
            for (i, j), score in sorted(scores.items()):
                if score > avg:
                    # Reuse the group of this question, or preferably the
                    # group this answer was already put in; otherwise open
                    # a new group.
                    pos = positions.get(i, None)
                    pos = answer_positions.get(j, pos)
                    if pos == None:
                        positions[i] = pos = len(q_)
                        q_.append([])
                        a_.append([])
                    # Every block is placed in at most one group.
                    if i not in used_q:
                        q_[pos].append(q[i])
                        used_q.append(i)
                    if j not in used_a:
                        a_[pos].append(a[j])
                        used_a.append(j)
                    answer_positions[j] = pos
            q = (' '.join(phrase) for phrase in q_)
            a = (' '.join(phrase) for phrase in a_)
            diff = 0

        # Drop empty pieces on both sides.
        # NOTE(review): this also removes the '' placeholders inserted by the
        # blockquote branch above - confirm that this is intended.
        # NOTE(review): the emptiness check below only works if filter()
        # returns a list (Python 2 behaviour, which the use of izip and
        # izip_longest suggests) - confirm.
        q = filter(None, q)
        a = filter(None, a)
        if not (q and a):
            continue
        # Pair the pieces positionally; the shorter side is padded with ''.
        for q_, a_ in izip_longest(q, a, fillvalue=''):
            question_ = copy(question)
            question_.body = q_
            answer_ = copy(answer)
            answer_.body = reformat(a_)
            yield question_, answer_