Exemplo n.º 1
0
 def parse(cls, api, json):
     """Build an instance of ``cls`` from a decoded JSON mapping.

     Every key of ``json`` becomes an attribute on a freshly created
     ``cls()`` object. Two keys are converted before being stored:
     ``created_at`` goes through ``parse_search_datetime`` and ``source``
     is HTML-unescaped and then passed to ``parse_html_value``. All other
     values are stored unchanged. ``api`` is accepted but not used here.
     """
     instance = cls()
     for key, raw in json.items():
         if key == 'source':
             value = parse_html_value(unescape_html(raw))
         elif key == 'created_at':
             value = parse_search_datetime(raw)
         else:
             # No conversion needed: keep the value exactly as received.
             value = raw
         setattr(instance, key, value)
     return instance
Exemplo n.º 2
0
 def parse(cls, api, json):
     """Create a ``cls`` object whose attributes mirror the JSON mapping.

     ``created_at`` values are converted with ``parse_search_datetime``;
     ``source`` values are HTML-unescaped and run through
     ``parse_html_value``. Any other key is copied verbatim. The ``api``
     argument is not used by this method.
     """
     parsed = cls()
     for field, raw_value in json.items():
         if field == 'created_at':
             setattr(parsed, field, parse_search_datetime(raw_value))
             continue
         if field == 'source':
             plain_source = unescape_html(raw_value)
             setattr(parsed, field, parse_html_value(plain_source))
             continue
         # Plain passthrough for every field without a special converter.
         setattr(parsed, field, raw_value)
     return parsed
Exemplo n.º 3
0
    def handleArticle(self, current_page):
        #print('ARTICLE:', repr(current_page.title))

        if self.pass_num == 0:
            models.articles.save(models.Article(id=current_page['id'], title=current_page['title'], revision_id=current_page['revision_id']))
        elif self.pass_num == 1:
            aid = models.articles.resolve_title(current_page['title'])            
            wikitext__ = utils.unescape_html(current_page['text'])
            
            redirect = wikitext.parse_redirect(wikitext__)
            if redirect:
                redirect_dest_title, dest_frag, redirect_label = redirect
                article = models.articles[aid]
                if redirect_dest_title in models.articles:
                    dest_id = models.articles.resolve_title(redirect_dest_title)
                else:
                    dest_id = None
                    #print('Broken REDIRECT FROM ', repr(current_page['title']), ' TO ', repr(redirect), file=parse_log) 
                    #print(repr(redirect_dest_title), file=parse_log)
                    print >>parse_log, repr(redirect_dest_title)
                article.redirect = models.Redirect(dest_id, dest_frag, redirect_label)
                models.articles.save(article)
            else:
                #do not parse/allow links if there is a redirect. If there is a redirect, links are meaningless. besides, parse_links does not work properly with redirects.
                for link_dest_title, dest_frag, link_label, snippet in wikitext.parse_links(wikitext__):
                    lid = self.num_links + 1 #0 is an invalid key
                    #assert lid not in models.links
                    if link_dest_title in models.articles:
                        dest_id = models.articles.resolve_title(link_dest_title)
                    else:
                        dest_id = None
                        #print('Broken LINK FROM ', repr(current_page['title']), ' TO ', repr((link_dest_title, dest_frag, link_label)), file=parse_log) 
                        #print(repr(link_dest_title), file=parse_log)
                        print >>parse_log, repr(link_dest_title)
                    models.links.save(models.Link(src_id=aid, dest_id=dest_id, dest_fragment=dest_frag,
                        label=link_label, snippet=snippet, id=lid))
                    self.num_links += 1
                    '''
                    if lid > len(models.links):
                        ...
                    else:
                        self.num_links += 1 #skip
                    '''
                        
        self.num_articles += 1

        if self.num_articles % 1000 == 0:
            #print(self.num_articles, self.num_links)
            print self.num_articles, self.num_links
Exemplo n.º 4
0
    def _convert_options(self, question_index, options, input_type):
        """Render answer options as a list of HTML lines forming a ``<form>``.

        Returns an empty list when ``options`` is empty or ``None``.
        Otherwise the list starts with ``<form>``, contains one ``<label>``
        line per option (an ``<input>`` of type ``input_type`` named after
        ``question_index`` followed by the option text) and ends with
        ``</form>``.
        """
        if not options:
            return []

        label_template = ('<label><input type="%s" name="%s">'
                          '%s<br></label>')
        html_lines = ['<form>']
        for choice in options:
            raw_text = choice['display']['definition']['value']
            # <text> is swapped for <span> so that the answer text stays on
            # the same line as its checkbox/radio button.
            inline_text = self._replace_tag(
                unescape_html(raw_text), 'text', 'span')
            html_lines.append(
                label_template % (input_type, question_index, inline_text))
        html_lines.append('</form>')
        return html_lines
Exemplo n.º 5
0
    def __call__(self, quiz_or_exam_json):
        """Convert a quiz/exam JSON document into a single HTML string.

        For each entry of ``quiz_or_exam_json['questions']`` the output
        contains, in order: a ``<h3>`` heading with the 1-based question
        number, the unescaped prompt text, an input field when the question
        type is in ``self.KNOWN_INPUT_TYPES``, the rendered options, and a
        ``<hr>`` separator. Question types missing from
        ``self.KNOWN_QUESTION_TYPES`` are logged and still rendered.
        The pieces are joined with newlines.
        """
        # Maps the question type from the JSON reply to an HTML input type;
        # types without an entry fall back to the empty string.
        html_input_types = {
            'mcq': 'radio',
            'mcqReflect': 'radio',
            'checkbox': 'checkbox'
        }

        pieces = []
        questions = quiz_or_exam_json['questions']
        for question_index, question_json in enumerate(questions):
            kind = question_json['question']['type']
            if kind not in self.KNOWN_QUESTION_TYPES:
                logging.info('Unknown question type: %s', kind)
                logging.info('Question json: %s', question_json)
                logging.info('Please report class name, quiz name and the data'
                             ' above to coursera-dl authors')

            definition = question_json['variant']['definition']
            prompt = definition['prompt']
            options = definition.get('options', [])

            # Heading followed by the question text itself.
            pieces.append('<h3>Question %d</h3>' % (question_index + 1))
            pieces.append(unescape_html(prompt['definition']['value']))

            # Free-form answer field, when the question type calls for one.
            if kind in self.KNOWN_INPUT_TYPES:
                pieces.extend(self._generate_input_field())

            # Options are rendered as either checkboxes or radio buttons.
            pieces.extend(self._convert_options(
                question_index, options, html_input_types.get(kind, '')))

            pieces.append('<hr>')

        return '\n'.join(pieces)
Exemplo n.º 6
0
def transform_body(body):
	"""Extract parsed code snippets and code-hint tokens from an HTML body.

	The body is split on ``</code>``; in every piece containing ``<code>``
	the text after the first opening tag is HTML-unescaped and classified:

	* text containing both ``.`` and ``(`` is kept as a snippet to parse;
	* text shorter than 25 characters is kept as a hint, except when it is
	  such a snippet and its piece also contains ``<pre`` (heuristic for
	  "not an inline code block").

	Returns a tuple ``(asts, code_hints)``: the truthy results of
	``parse(snippet, resolve=True)`` in snippet order, and the set of tokens
	produced by ``utils.tokenize`` over all hints.
	"""
	open_tag = "<code>"
	snippets = []
	short_tags = []
	for chunk in body.split("</code>"):
		if open_tag not in chunk:
			continue

		# Keep only what follows the first opening tag of this chunk.
		code_tag = utils.unescape_html(chunk.partition(open_tag)[2])

		looks_like_call = "." in code_tag and "(" in code_tag
		if looks_like_call:
			snippets.append(code_tag)

		# Short tags are hints, unless the tag is a call-like snippet that
		# sits inside a <pre> block rather than inline.
		if len(code_tag) < 25 and not (looks_like_call and "<pre" in chunk):
			short_tags.append(code_tag)

	code_hints = set(
		token for tag in short_tags for token in utils.tokenize(tag))

	# Snippets for which parse() returns something falsy are dropped.
	asts = []
	for snippet in snippets:
		tree = parse(snippet, resolve=True)
		if tree:
			asts.append(tree)

	return asts, code_hints
Exemplo n.º 7
0
                    ln += 1
                    currentline = self.file_content_lines[ln].strip()

        return ln

    def highlight_matched_terms(self, gitsearch_item_html):
        """Wrap occurrences of the matched terms in ``<span class="hll">``.

        Every entry of ``self.matched_terms`` is searched case-insensitively
        on word boundaries. For a qualified term (one containing ``.``) each
        dot-separated part is highlighted on its own as well.

        All candidates are combined into one pattern and applied in a single
        pass, longest candidate first, so that

        * terms are matched literally (``.`` and other regex metacharacters
          are escaped instead of being interpreted),
        * a qualified term is highlighted as a whole rather than being
          overwritten by one of its parts,
        * markup inserted for one term is never re-scanned for another, and
        * the matched text keeps its original casing.

        :param gitsearch_item_html: HTML text to decorate.
        :return: the HTML with highlight spans inserted; returned unchanged
            when there is nothing to search for.
        """
        candidates = []
        seen = set()
        for term in self.matched_terms:
            parts = [term]
            if "." in term:
                parts.extend(term.split("."))
            for part in parts:
                # Empty parts (e.g. from a trailing dot) would match at every
                # word boundary, so they are skipped.
                if part and part not in seen:
                    seen.add(part)
                    candidates.append(part)

        if not candidates:
            return gitsearch_item_html

        # Longest first: alternation picks the first alternative that matches,
        # so "Integer.toBinaryString" must be tried before "Integer".
        candidates.sort(key=len, reverse=True)
        pattern = re.compile(
            r'\b(?:%s)\b' % '|'.join(re.escape(c) for c in candidates),
            re.IGNORECASE)

        # A callable replacement avoids backslash/group-reference processing
        # of the replacement text and preserves the matched text verbatim.
        return pattern.sub(
            lambda match: '<span class="hll">%s</span>' % match.group(0),
            gitsearch_item_html)


if __name__ == '__main__':
    path = "/Users/Raphael/Downloads/GitArchive/linkedin_indextank-engine/indextank-engine/lucene-experimental/com/flaptor/org/apache/lucene/util/automaton/UTF32ToUTF8.java"
    matched_terms = [u'Integer.toBinaryString', u'Integer']
    #i = GitSearchItem(path, matched_terms)
    file_content = read_file(path)

    print unescape_html(
        highlight(file_content, JavaLexer(), MyHtmlFormatter(linenos=True)))
Exemplo n.º 8
0
def add_code_into_document(document, body):
    """Add fields extracted from the code in ``body`` to ``document``.

    ``transform_body(body)`` supplies the parsed snippets and the code hints.
    For every parsed snippet the truthy entries of its ``typed_method_call``,
    ``extends``, ``used_classes``, ``methods``, ``methods_called`` and
    ``class_instance_creation`` lists are added as stored, analyzed fields;
    ``comments`` (when present) are added unescaped and not stored;
    ``literals`` are added as ``StringField`` entries. Finally each distinct
    hint token longer than 1 and shorter than 20 characters is added as a
    ``code_hints`` field.

    Returns ``True`` when at least one ``typed_method_call``, ``methods``,
    ``methods_called`` or ``class_instance_creation`` field was added,
    ``False`` otherwise.
    """
    asts, code_hints = transform_body(body)

    def add_analyzed(name, values):
        """Add a stored, analyzed field per truthy value; tell if any was added."""
        added = False
        for value in values:
            if value:
                document.add(
                    Field(name, value, Field.Store.YES, Field.Index.ANALYZED))
                added = True
        return added

    # (field name, whether adding it counts towards the returned flag);
    # the order fixes the order in which fields reach the document.
    leading_fields = (
        ("typed_method_call", True),
        ("extends", False),
        ("used_classes", False),
        ("methods", True),
        ("methods_called", True),
    )

    found_code = False
    for ast in asts:
        for name, counts in leading_fields:
            if add_analyzed(name, ast[name]) and counts:
                found_code = True

        # Comments are optional, are not filtered for emptiness and are
        # indexed without being stored.
        if "comments" in ast:
            for comment in ast["comments"]:
                document.add(
                    Field("comments", utils.unescape_html(comment),
                          Field.Store.NO, Field.Index.ANALYZED))

        if add_analyzed("class_instance_creation",
                        ast["class_instance_creation"]):
            found_code = True

        for literal in ast["literals"]:
            if literal:
                document.add(StringField("literals", literal, Field.Store.YES))

    # Hints are tokenized once more; only mid-sized tokens are kept.
    unique_hints = set(
        token
        for code_hint in code_hints
        for token in utils.tokenize(code_hint)
        if 1 < len(token) < 20)

    for hint in unique_hints:
        document.add(
            Field("code_hints", hint, Field.Store.YES, Field.Index.ANALYZED))

    return found_code
Exemplo n.º 9
0
    def handleArticle(self, current_page):
        #print('ARTICLE:', repr(current_page.title))

        if self.pass_num == 0:
            models.articles.save(
                models.Article(id=current_page['id'],
                               title=current_page['title'],
                               revision_id=current_page['revision_id']))
        elif self.pass_num == 1:
            aid = models.articles.resolve_title(current_page['title'])
            wikitext__ = utils.unescape_html(current_page['text'])

            redirect = wikitext.parse_redirect(wikitext__)
            if redirect:
                redirect_dest_title, dest_frag, redirect_label = redirect
                article = models.articles[aid]
                if redirect_dest_title in models.articles:
                    dest_id = models.articles.resolve_title(
                        redirect_dest_title)
                else:
                    dest_id = None
                    #print('Broken REDIRECT FROM ', repr(current_page['title']), ' TO ', repr(redirect), file=parse_log)
                    #print(repr(redirect_dest_title), file=parse_log)
                    print >> parse_log, repr(redirect_dest_title)
                article.redirect = models.Redirect(dest_id, dest_frag,
                                                   redirect_label)
                models.articles.save(article)
            else:
                #do not parse/allow links if there is a redirect. If there is a redirect, links are meaningless. besides, parse_links does not work properly with redirects.
                for link_dest_title, dest_frag, link_label, snippet in wikitext.parse_links(
                        wikitext__):
                    lid = self.num_links + 1  #0 is an invalid key
                    #assert lid not in models.links
                    if link_dest_title in models.articles:
                        dest_id = models.articles.resolve_title(
                            link_dest_title)
                    else:
                        dest_id = None
                        #print('Broken LINK FROM ', repr(current_page['title']), ' TO ', repr((link_dest_title, dest_frag, link_label)), file=parse_log)
                        #print(repr(link_dest_title), file=parse_log)
                        print >> parse_log, repr(link_dest_title)
                    models.links.save(
                        models.Link(src_id=aid,
                                    dest_id=dest_id,
                                    dest_fragment=dest_frag,
                                    label=link_label,
                                    snippet=snippet,
                                    id=lid))
                    self.num_links += 1
                    '''
                    if lid > len(models.links):
                        ...
                    else:
                        self.num_links += 1 #skip
                    '''

        self.num_articles += 1

        if self.num_articles % 1000 == 0:
            #print(self.num_articles, self.num_links)
            print self.num_articles, self.num_links