def parse_simple_link(s, suffix):
    m = re.match(r"((?P<title>.*),\s*)?" + link_re_str(suffix), s, re.IGNORECASE)
    if m is None:
        return
    d = m.groupdict()
    if not d["title"]:
        d["title"] = fetch_title(d["link"])
    else:
        d["title"] = beautify(d["title"])
    return d
def parse_msword_link(s, suffix):
    m = re.match(r'HYPERLINK\s*"(?P<title>.*)"\s*' + link_re_str(suffix), s, re.IGNORECASE)
    if m is None:
        return
    d = m.groupdict()
    if d["title"] == d["link"]:
        d["title"] = fetch_title(d["link"])
    else:
        d["title"] = beautify(d["title"])
    return d
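# Minimal usage sketch for the two parsers above. link_re_str(suffix) is not
# shown in this file; the sketch only assumes it returns a regex string with a
# (?P<link>...) named group, which both functions rely on via d["link"].
#
#   parse_simple_link("Some Paper, http://example.com/a.pdf", "pdf")
#   # -> {"title": beautify("Some Paper"), "link": "http://example.com/a.pdf"}
#
#   parse_msword_link('HYPERLINK "Some Paper" http://example.com/a.pdf', "pdf")
#   # -> {"title": beautify("Some Paper"), "link": "http://example.com/a.pdf"}
#
# When the title is missing (or, for MS Word links, equals the URL itself),
# both fall back to fetch_title(d["link"]).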
def crawl(line):
    result = {}
    items = []
    t1 = datetime.now()

    # get domain name of the feed source
    domain = p.search(line)

    # generate a stable filename for each source
    file_dir = "./output/"
    if domain:
        file_dir += domain.group()[3:].replace('.', '_')
    else:
        m.update(line)
        file_dir += m.hexdigest()
    output_file = open(file_dir + ".json", "w+")

    item_count = 0
    try:
        # request data from local server
        response = urllib2.urlopen(service_url + '&url=' + line)
        single_page_full_rss = response.read()
        output_file.write(single_page_full_rss)
        output_file.close()

        # get main body of the result and leave the rest behind
        json_rss = json.loads(single_page_full_rss)
        content = json_rss['rss']['channel']['item']
        if type(content) is list:
            # extract restricted info fields from results of five-filters
            for item in content:
                if item['result'] == 'success':
                    newitem = {}
                    newitem['title'] = item['title']
                    newitem['datetime'] = item['pubDate']
                    newitem['content'] = beautifier.beautify(item['description'])
                    newitem['link'] = item['link']
                    items.append(newitem)
                    item_count += 1
                else:
                    l.log("Url extraction failure: \t\t" + item['link'])
        else:
            # extraction failure
            l.log("Feed extraction failure: \t\t" + line)
    except Exception, e:
        print e.message
        pass
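# Shape of the full-text RSS JSON that crawl() expects back from the local
# service (field names inferred from the accesses above; a real payload from a
# five-filters-style endpoint may carry additional fields):
#
# {
#   "rss": {"channel": {"item": [
#       {"result": "success",
#        "title": "Article title",
#        "pubDate": "Mon, 01 Jan 2018 00:00:00 +0000",
#        "description": "<p>extracted article body ...</p>",
#        "link": "http://example.com/article"}
#   ]}}
# }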
def feedback_confirm(request):
    """ Webhook action to confirm feedback received from the user """
    global training_feedback

    words_to_highlight = []
    response_text = "In the phrase '**{}**', the words: \n ".format(
        training_feedback['original_intent'])
    for entity, values in training_feedback['entities'].items():
        response_text += ", and ".join(value for value in values.keys())
        response_text += " \nmust be considered {}(s);".format(entity)
        words_to_highlight = values.keys()
        print('words', words_to_highlight)
    response_text += " \nIs that right?"

    response = make_card_response("Feedback confirmation",
                                  response_text,
                                  "Can you confirm your feedback then?",
                                  beautify(response_text, words_to_highlight))
    return response
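# Shape of the global training_feedback that feedback_confirm() reads
# (inferred from the accesses above; the example intent and entity names are
# hypothetical, and the inner dict values are never used by this function):
#
# training_feedback = {
#     'original_intent': 'book a flight to Paris tomorrow',
#     'entities': {
#         'destination': {'Paris': None},
#         'date': {'tomorrow': None},
#     },
# }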
def get_answer(topic, keyword, options="", request_options=None): # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    Find cheat sheet for the topic.
    If `keyword` is None or empty, return the whole answer.
    Otherwise return only the paragraphs containing the keyword.

    Args:
        topic (str):    the name of the topic of the cheat sheet
        keyword (str):  the keyword(s) to search for in the cheat sheet

    Returns:
        string: the cheat sheet
    """

    def _join_paragraphs(paragraphs):
        answer = "\n".join(paragraphs)
        return answer

    def _split_paragraphs(text):
        answer = []
        paragraph = ""
        for line in text.splitlines():
            if line == "":
                answer.append(paragraph)
                paragraph = ""
            else:
                paragraph += line + "\n"
        answer.append(paragraph)
        return answer

    def _paragraph_contains(paragraph, keyword, insensitive=False, word_boundaries=True):
        """
        Check if `paragraph` contains `keyword`.
        Several keywords can be joined together using ~
        For example: ~ssh~passphrase
        """
        answer = True

        if '~' in keyword:
            keywords = keyword.split('~')
        else:
            keywords = [keyword]

        for kwrd in keywords:
            regex = re.escape(kwrd)
            if not word_boundaries:
                regex = r"\b%s\b" % kwrd

            if insensitive:
                answer = answer and bool(re.search(regex, paragraph, re.IGNORECASE))
            else:
                answer = answer and bool(re.search(regex, paragraph))

        return answer

    def _rewrite_aliases(word):
        if word == ':bash.completion':
            return ':bash_completion'
        return word

    def _rewrite_section_name(query):
        """
        Rewrite the section name (the part before '/') according to LANGUAGE_ALIAS.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = LANGUAGE_ALIAS.get(section_name, section_name)
        return "%s/%s" % (section_name, rest)

    def _rewrite_section_name_for_q(query):
        """
        Rewrite the section name (the part before '/') according to SO_NAME.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = SO_NAME.get(section_name, section_name)
        print("%s/%s" % (section_name, rest))
        return "%s/%s" % (section_name, rest)

    answer = None
    needs_beautification = False

    topic = _rewrite_aliases(topic)
    topic = _rewrite_section_name(topic)

    # this is pretty suboptimal, so this part should be rewritten;
    # for most queries we could say immediately what type the query has
    start_time = time.time()
    topic_type = get_topic_type(topic)
    print((time.time() - start_time)*1000)

    # checking if the answer is in the cache
    if topic != "":
        # temporary hack for "questions":
        # the topic name has to be prefixed with q:
        # so we can later delete them from redis,
        # and we know that they need beautification
        #if '/' in topic and '+' in topic:
        if topic_type == 'question': #'/' in topic and '+' in topic:
            topic = _rewrite_section_name_for_q(topic)
            topic = "q:" + topic
            needs_beautification = True

        answer = REDIS.get(topic)
        if answer:
            answer = answer.decode('utf-8')

    # if the answer was not found in the cache,
    # try to find it in one of the repositories
    if not answer:
        #topic_type = get_topic_type(topic)

        for topic_getter_type, topic_getter in TOPIC_GETTERS:
            if topic_type == topic_getter_type:
                answer = topic_getter(topic)
                break
        if not answer:
            topic_type = "unknown"
            answer = _get_unknown(topic)

        # saving answers in the cache
        if topic_type not in ["search", "internal", "unknown"]:
            REDIS.set(topic, answer)

    if needs_beautification:
        filetype = 'bash'
        if '/' in topic:
            filetype = topic.split('/', 1)[0]
            if filetype.startswith('q:'):
                filetype = filetype[2:]

        answer = beautifier.beautify(answer.encode('utf-8'), filetype, request_options)

    if not keyword:
        return answer

    #
    # shorten the answer, because a keyword is specified
    #
    insensitive = 'i' in options
    word_boundaries = 'b' in options

    paragraphs = _split_paragraphs(answer)
    paragraphs = [p for p in paragraphs
                  if _paragraph_contains(p, keyword,
                                         insensitive=insensitive,
                                         word_boundaries=word_boundaries)]
    if paragraphs == []:
        return ""

    answer = _join_paragraphs(paragraphs)
    return answer
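# Standalone illustration of the keyword-filtering step at the end of
# get_answer() (a simplified sketch, not a drop-in replacement for the nested
# helpers): split the answer on blank lines and keep only the paragraphs that
# match every keyword joined with '~', e.g. "ssh~passphrase".
import re

def filter_paragraphs(text, keyword, insensitive=False):
    flags = re.IGNORECASE if insensitive else 0
    keywords = keyword.split('~') if '~' in keyword else [keyword]
    paragraphs = [p for p in text.split("\n\n") if p.strip()]
    return "\n\n".join(p for p in paragraphs
                       if all(re.search(re.escape(k), p, flags)
                              for k in keywords))

# e.g. filter_paragraphs(sheet, "ssh~passphrase", insensitive=True) keeps only
# the paragraphs that mention both "ssh" and "passphrase".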
import beautifier

string = "'test_safd' At nearly 7,000 words, you probably don\u2019t want to try</p>sadfsadf"

f = open("tmp.txt", "w+")
print string.decode('unicode-escape')
r = beautifier.beautify(string)
f.write(r)
print r
def read_title(f):
    par = read_paragraph(f)
    if par is not None:
        return beautify(par.strip())
def extract_tags(s, tags):
    l = re.split(r"[()]", s)
    if len(l) < 2:
        return s
    tags += [beautify(p.strip()) for p in l[1].split(',')]
    return l[0]
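# Usage sketch for extract_tags (beautify() is defined elsewhere; here it is
# only assumed to normalize a single tag string):
#
#   tags = []
#   title = extract_tags("Deep Learning (neural networks, optimization)", tags)
#   # title -> "Deep Learning "
#   # tags  -> [beautify("neural networks"), beautify("optimization")]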