Exemplo n.º 1
0
def parse_simple_link(s, suffix):
    """Parse a line of the form ``[<title>, ]<link>`` into a dict.

    The link part of the pattern comes from ``link_re_str(suffix)``; it is
    expected to define a ``link`` named group (the code reads ``["link"]``
    from the match) -- confirm against ``link_re_str``.

    Args:
        s: the text to parse; matched from its start, case-insensitively.
        suffix: forwarded unchanged to ``link_re_str``.

    Returns:
        The match's ``groupdict()`` with ``"title"`` replaced by either the
        beautified captured title or, when no title was captured (or it is
        empty), the result of ``fetch_title`` on the link.  ``None`` when
        `s` does not match.
    """
    pattern = r"((?P<title>.*),\s*)?" + link_re_str(suffix)
    match = re.match(pattern, s, re.IGNORECASE)
    if match is None:
        return None

    fields = match.groupdict()
    raw_title = fields["title"]
    # An absent/empty title is looked up from the link itself.
    fields["title"] = (beautify(raw_title) if raw_title
                       else fetch_title(fields["link"]))
    return fields
Exemplo n.º 2
0
def parse_msword_link(s, suffix):
    """Parse a line of the form ``HYPERLINK "<title>" <link>`` into a dict.

    The link part of the pattern comes from ``link_re_str(suffix)``; it is
    expected to define a ``link`` named group (the code reads ``["link"]``
    from the match) -- confirm against ``link_re_str``.

    Args:
        s: the text to parse; matched from its start, case-insensitively.
        suffix: forwarded unchanged to ``link_re_str``.

    Returns:
        The match's ``groupdict()`` with ``"title"`` replaced by the
        beautified quoted title, or by the result of ``fetch_title`` on the
        link when the quoted title is just the link repeated.  ``None``
        when `s` does not match.
    """
    pattern = r'HYPERLINK\s*"(?P<title>.*)"\s*' + link_re_str(suffix)
    match = re.match(pattern, s, re.IGNORECASE)
    if match is None:
        return None

    fields = match.groupdict()
    link = fields["link"]
    quoted_title = fields["title"]
    # A title identical to the link carries no information; look it up.
    fields["title"] = (fetch_title(link) if quoted_title == link
                       else beautify(quoted_title))
    return fields
Exemplo n.º 3
0
def crawl(line):
    """Fetch the extracted feed for `line` and store the raw response.

    The raw response is written to ``./output/<name>.json``.  ``<name>`` is
    the match of the module-level pattern ``p`` on `line` with its first
    three characters dropped and dots replaced by underscores; when ``p``
    does not match, it is the hex digest of the module-level hash object
    ``m`` after `line` has been fed to it.

    The response is then parsed as JSON.  Entries of
    ``rss.channel.item`` whose ``result`` is ``'success'`` are collected
    (title, datetime, beautified content, link); every other entry, and a
    non-list ``item`` value, is reported through ``l.log``.

    Any exception raised while fetching, writing or parsing is printed and
    swallowed (best effort), as in the original code.  An error opening the
    output file still propagates to the caller.

    Args:
        line: feed URL; appended verbatim to ``service_url``.
            NOTE(review): the URL is neither stripped nor percent-encoded
            before being appended -- confirm callers pass a clean value.

    Returns:
        None.  NOTE(review): the collected items are not returned or stored
        by the code visible here.
    """
    items = []

    # get domain name of the feed source
    domain = p.search(line)
    # generate identical filename for each source
    file_dir = "./output/"
    if domain:
        file_dir += domain.group()[3:].replace('.', '_')
    else:
        m.update(line)
        file_dir += m.hexdigest()
    output_file = open(file_dir + ".json", "w+")

    try:
        # The with-block closes the file even when the request or the read
        # fails; previously the handle leaked on any error before close().
        with output_file:
            # request data from local server
            response = urllib2.urlopen(service_url + '&url=' + line)
            single_page_full_rss = response.read()
            output_file.write(single_page_full_rss)

        # get main body of the result and leave the rest behind
        json_rss = json.loads(single_page_full_rss)
        content = json_rss['rss']['channel']['item']
        if not isinstance(content, list):
            # extraction failure
            l.log("Feed extraction failure: \t\t" + line)
            return

        # extract restricted info fields from results of five-filters
        for item in content:
            if item['result'] != 'success':
                l.log("Url extraction failure: \t\t" + item['link'])
                continue
            items.append({
                'title': item['title'],
                'datetime': item['pubDate'],
                'content': beautifier.beautify(item['description']),
                'link': item['link'],
            })
    except Exception as e:
        # Best effort: report the problem and carry on with the next feed.
        print(e.message)
Exemplo n.º 4
0
def feedback_confirm(request):
    """ Webhook action to confirm feedback received from the user

    Builds a confirmation message from the module-level
    ``training_feedback``: the original phrase, followed, for every entity,
    by the words assigned to it.  The message is sent back as a card whose
    last argument is the same text passed through ``beautify`` together
    with the words mentioned in it.

    Args:
        request: not used by this action.

    Returns:
        Whatever ``make_card_response`` returns for the confirmation card.
    """
    words_to_highlight = []
    response_text = "In the phrase '**{}**', the words:  \n&nbsp;&nbsp;&nbsp;&nbsp;".format(
        training_feedback['original_intent'])

    for entity, values in training_feedback['entities'].items():
        words = list(values.keys())
        response_text += ", and ".join(words)
        response_text += "  \nmust be considered {}(s);".format(entity)
        # Accumulate across entities: the text lists the words of every
        # entity, so all of them are handed to beautify, not only the
        # words of the last entity iterated.
        words_to_highlight.extend(words)

    response_text += "  \nIs that right?"

    return make_card_response("Feedback confirmation", response_text,
                              "Can you confirm your feedback then?",
                              beautify(response_text, words_to_highlight))
Exemplo n.º 5
0
def get_answer(topic, keyword, options="", request_options=None): # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    Find cheat sheet for the topic.
    If `keyword` is None or empty, return the whole answer.
    Otherwise keep only the paragraphs that contain the keyword(s).

    Args:
        topic (str):    the name of the topic of the cheat sheet
        keyword (str):  the keyword(s) to search for in the cheat sheet;
                        several keywords can be joined with `~`, and a
                        paragraph must contain all of them
        options (str):  search flags: `i` makes the keyword search case
                        insensitive, `b` matches keywords as whole words
        request_options: passed unchanged to `beautifier.beautify`

    Returns:
        string:         the cheat sheet; "" if `keyword` is given and no
                        paragraph contains it
    """

    def _join_paragraphs(paragraphs):
        """Concatenate paragraphs, separating them with a newline."""
        return "\n".join(paragraphs)

    def _split_paragraphs(text):
        """
        Split `text` into paragraphs at empty lines.
        Every line kept in a paragraph is terminated with a newline.
        """
        paragraphs = []
        current = []
        for line in text.splitlines():
            if line == "":
                paragraphs.append("".join(current))
                current = []
            else:
                current.append(line + "\n")
        paragraphs.append("".join(current))
        return paragraphs

    def _paragraph_contains(paragraph, keyword, insensitive=False, word_boundaries=True):
        """
        Check if `paragraph` contains `keyword`.
        Several keywords can be joined together using ~
        For example: ~ssh~passphrase
        All of the keywords have to be found.
        """
        flags = re.IGNORECASE if insensitive else 0

        for kwrd in keyword.split('~'):
            # Keywords are literal text: always escape them, also inside
            # the word-boundary form (it used to embed the raw keyword,
            # so e.g. "c++" was an invalid pattern).
            regex = re.escape(kwrd)
            # Anchor on word boundaries when they are requested; the
            # condition used to be inverted relative to the flag's name.
            if word_boundaries:
                regex = r"\b%s\b" % regex

            if not re.search(regex, paragraph, flags):
                return False

        return True

    def _rewrite_aliases(word):
        """Map alternative spellings of a topic to the canonical one."""
        if word == ':bash.completion':
            return ':bash_completion'
        return word

    def _rewrite_section_name(query):
        """
        Replace the section (the part before the first `/`) with its
        LANGUAGE_ALIAS entry, if there is one.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = LANGUAGE_ALIAS.get(section_name, section_name)
        return "%s/%s" % (section_name, rest)

    def _rewrite_section_name_for_q(query):
        """
        Replace the section (the part before the first `/`) with its
        SO_NAME entry, if there is one.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = SO_NAME.get(section_name, section_name)
        return "%s/%s" % (section_name, rest)


    answer = None
    needs_beautification = False

    topic = _rewrite_aliases(topic)
    topic = _rewrite_section_name(topic)

    # this is pretty unoptimal
    # so this part should be rewritten
    # for the most queries we could say immediately
    # what type the query has
    topic_type = get_topic_type(topic)

    # checking if the answer is in the cache
    if topic != "":
        # temporary hack for "questions":
        # the topic name has to be prefixed with q:
        # so we can later delete them from redis
        # and we know that they need beautification
        if topic_type == 'question':
            topic = _rewrite_section_name_for_q(topic)
            topic = "q:" + topic
            needs_beautification = True

        answer = REDIS.get(topic)
        if answer:
            answer = answer.decode('utf-8')

    # if answer was not found in the cache
    # try to find it in one of the repositories
    if not answer:
        for topic_getter_type, topic_getter in TOPIC_GETTERS:
            if topic_type == topic_getter_type:
                answer = topic_getter(topic)
                break
        if not answer:
            topic_type = "unknown"
            answer = _get_unknown(topic)

        # saving answers in the cache
        if topic_type not in ["search", "internal", "unknown"]:
            REDIS.set(topic, answer)

    if needs_beautification:
        # the section name (without the q: prefix) selects the file type
        filetype = 'bash'
        if '/' in topic:
            filetype = topic.split('/', 1)[0]
            if filetype.startswith('q:'):
                filetype = filetype[2:]

        answer = beautifier.beautify(answer.encode('utf-8'), filetype, request_options)

    if not keyword:
        return answer

    #
    # shorten the answer, because keyword is specified
    #
    insensitive = 'i' in options
    word_boundaries = 'b' in options

    paragraphs = [p for p in _split_paragraphs(answer)
                  if _paragraph_contains(p, keyword,
                                         insensitive=insensitive,
                                         word_boundaries=word_boundaries)]
    if not paragraphs:
        return ""

    return _join_paragraphs(paragraphs)
Exemplo n.º 6
0
def get_answer(topic, keyword, options="", request_options=None): # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    Find cheat sheet for the topic.
    If `keyword` is None or empty, return the whole answer.
    Otherwise keep only the paragraphs that contain the keyword(s).

    Args:
        topic (str):    the name of the topic of the cheat sheet
        keyword (str):  the keyword(s) to search for in the cheat sheet;
                        several keywords can be joined with `~`, and a
                        paragraph must contain all of them
        options (str):  search flags: `i` makes the keyword search case
                        insensitive, `b` matches keywords as whole words
        request_options: passed unchanged to `beautifier.beautify`

    Returns:
        string:         the cheat sheet; "" if `keyword` is given and no
                        paragraph contains it
    """

    def _join_paragraphs(paragraphs):
        """Concatenate paragraphs, separating them with a newline."""
        return "\n".join(paragraphs)

    def _split_paragraphs(text):
        """
        Split `text` into paragraphs at empty lines.
        Every line kept in a paragraph is terminated with a newline.
        """
        paragraphs = []
        current = []
        for line in text.splitlines():
            if line == "":
                paragraphs.append("".join(current))
                current = []
            else:
                current.append(line + "\n")
        paragraphs.append("".join(current))
        return paragraphs

    def _paragraph_contains(paragraph, keyword, insensitive=False, word_boundaries=True):
        """
        Check if `paragraph` contains `keyword`.
        Several keywords can be joined together using ~
        For example: ~ssh~passphrase
        All of the keywords have to be found.
        """
        flags = re.IGNORECASE if insensitive else 0

        for kwrd in keyword.split('~'):
            # Keywords are literal text: always escape them, also inside
            # the word-boundary form (it used to embed the raw keyword,
            # so e.g. "c++" was an invalid pattern).
            regex = re.escape(kwrd)
            # Anchor on word boundaries when they are requested; the
            # condition used to be inverted relative to the flag's name.
            if word_boundaries:
                regex = r"\b%s\b" % regex

            if not re.search(regex, paragraph, flags):
                return False

        return True

    def _rewrite_aliases(word):
        """Map alternative spellings of a topic to the canonical one."""
        if word == ':bash.completion':
            return ':bash_completion'
        return word

    def _rewrite_section_name(query):
        """
        Replace the section (the part before the first `/`) with its
        LANGUAGE_ALIAS entry, if there is one.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = LANGUAGE_ALIAS.get(section_name, section_name)
        return "%s/%s" % (section_name, rest)

    def _rewrite_section_name_for_q(query):
        """
        Replace the section (the part before the first `/`) with its
        SO_NAME entry, if there is one.
        """
        if '/' not in query:
            return query

        section_name, rest = query.split('/', 1)
        section_name = SO_NAME.get(section_name, section_name)
        return "%s/%s" % (section_name, rest)


    answer = None
    needs_beautification = False

    topic = _rewrite_aliases(topic)
    topic = _rewrite_section_name(topic)

    # this is pretty unoptimal
    # so this part should be rewritten
    # for the most queries we could say immediately
    # what type the query has
    topic_type = get_topic_type(topic)

    # checking if the answer is in the cache
    if topic != "":
        # temporary hack for "questions":
        # the topic name has to be prefixed with q:
        # so we can later delete them from redis
        # and we know that they need beautification
        if topic_type == 'question':
            topic = _rewrite_section_name_for_q(topic)
            topic = "q:" + topic
            needs_beautification = True

        answer = REDIS.get(topic)
        if answer:
            answer = answer.decode('utf-8')

    # if answer was not found in the cache
    # try to find it in one of the repositories
    if not answer:
        for topic_getter_type, topic_getter in TOPIC_GETTERS:
            if topic_type == topic_getter_type:
                answer = topic_getter(topic)
                break
        if not answer:
            topic_type = "unknown"
            answer = _get_unknown(topic)

        # saving answers in the cache
        if topic_type not in ["search", "internal", "unknown"]:
            REDIS.set(topic, answer)

    if needs_beautification:
        # the section name (without the q: prefix) selects the file type
        filetype = 'bash'
        if '/' in topic:
            filetype = topic.split('/', 1)[0]
            if filetype.startswith('q:'):
                filetype = filetype[2:]

        answer = beautifier.beautify(answer.encode('utf-8'), filetype, request_options)

    if not keyword:
        return answer

    #
    # shorten the answer, because keyword is specified
    #
    insensitive = 'i' in options
    word_boundaries = 'b' in options

    paragraphs = [p for p in _split_paragraphs(answer)
                  if _paragraph_contains(p, keyword,
                                         insensitive=insensitive,
                                         word_boundaries=word_boundaries)]
    if not paragraphs:
        return ""

    return _join_paragraphs(paragraphs)
Exemplo n.º 7
0
import beautifier
# Manual smoke test: run a sample string through beautifier.beautify,
# show it before and after, and keep the result in tmp.txt.
string = "'test_safd' At nearly 7,000 words, you probably don\u2019t want to try</p>sadfsadf"

# The with-block closes tmp.txt once the result has been written; the
# handle used to stay open until interpreter exit.
with open("tmp.txt", "w+") as f:
    # Show the input with its \uXXXX escape sequences resolved.
    print(string.decode('unicode-escape'))
    r = beautifier.beautify(string)
    f.write(r)
print(r)
Exemplo n.º 8
0
def read_title(f):
    """Read the next paragraph from `f` and return it as a title.

    Args:
        f: passed unchanged to ``read_paragraph``.

    Returns:
        The paragraph, stripped of surrounding whitespace and passed
        through ``beautify``; ``None`` when ``read_paragraph`` returns
        ``None``.
    """
    paragraph = read_paragraph(f)
    if paragraph is None:
        return None
    stripped = paragraph.strip()
    return beautify(stripped)
Exemplo n.º 9
0
def extract_tags(s, tags):
    """Split a ``text (tag, tag, ...)`` string into its text and its tags.

    `s` is split at every ``(`` and ``)``.  The piece following the first
    parenthesis is treated as a comma-separated tag list; each tag is
    stripped, passed through ``beautify`` and appended to `tags` in place.

    Args:
        s: the string to split.
        tags: list that receives the extracted tags (modified in place).

    Returns:
        The part of `s` before the first parenthesis, or `s` itself
        (leaving `tags` untouched) when it contains no parenthesis.
    """
    pieces = re.split(r"[()]", s)
    if len(pieces) >= 2:
        raw_tags = pieces[1].split(',')
        tags += [beautify(raw.strip()) for raw in raw_tags]
        return pieces[0]
    return s