def parse(cls, api, json):
    """Build a ``cls`` instance whose attributes mirror the items of ``json``.

    Every key of ``json`` becomes an attribute on a freshly created
    ``cls()``. Two keys get converted first:

    * ``'created_at'`` is passed through ``parse_search_datetime``.
    * ``'source'`` is passed through ``unescape_html`` and then
      ``parse_html_value``.

    All other values are stored unchanged.

    Args:
        cls: Callable invoked with no arguments to create the result object.
        api: Accepted for interface compatibility; not used here.
        json: Mapping of attribute names to raw values.

    Returns:
        The populated instance.
    """
    instance = cls()
    for field, raw in json.items():
        if field == 'created_at':
            value = parse_search_datetime(raw)
        elif field == 'source':
            value = parse_html_value(unescape_html(raw))
        else:
            value = raw
        setattr(instance, field, value)
    return instance
def handleArticle(self, current_page):
    """Process one page; what happens depends on ``self.pass_num``.

    Pass 0 saves a ``models.Article`` built from the page's ``id``,
    ``title`` and ``revision_id``.

    Pass 1 resolves the page's article id, unescapes its text and then
    either

    * stores a ``models.Redirect`` on the article when
      ``wikitext.parse_redirect`` finds one, or
    * saves one ``models.Link`` per tuple yielded by
      ``wikitext.parse_links`` (links are only parsed when the page is
      not a redirect).

    A destination title that is not in ``models.articles`` gets
    ``dest_id=None`` and its ``repr`` is written to ``parse_log``.

    Output is written with ``parse_log.write`` and a single-argument
    ``print(...)`` so the method produces the same text on Python 2 and
    Python 3 (the former ``print >>f, x`` statements do not parse on 3).

    Args:
        current_page: Mapping with ``'id'``, ``'title'``, ``'revision_id'``
            (pass 0) and ``'title'``, ``'text'`` (pass 1).
    """
    if self.pass_num == 0:
        models.articles.save(models.Article(
            id=current_page['id'],
            title=current_page['title'],
            revision_id=current_page['revision_id']))
    elif self.pass_num == 1:
        aid = models.articles.resolve_title(current_page['title'])
        page_text = utils.unescape_html(current_page['text'])
        redirect = wikitext.parse_redirect(page_text)
        if redirect:
            redirect_dest_title, dest_frag, redirect_label = redirect
            article = models.articles[aid]
            if redirect_dest_title in models.articles:
                dest_id = models.articles.resolve_title(redirect_dest_title)
            else:
                # Broken redirect: keep it with no destination, log the title.
                dest_id = None
                parse_log.write(repr(redirect_dest_title) + '\n')
            article.redirect = models.Redirect(dest_id, dest_frag,
                                               redirect_label)
            models.articles.save(article)
        else:
            # Links are only parsed for non-redirect pages; the original
            # author notes parse_links does not work properly on redirects.
            for link_dest_title, dest_frag, link_label, snippet in \
                    wikitext.parse_links(page_text):
                lid = self.num_links + 1  # 0 is an invalid key
                if link_dest_title in models.articles:
                    dest_id = models.articles.resolve_title(link_dest_title)
                else:
                    # Broken link: keep it with no destination, log the title.
                    dest_id = None
                    parse_log.write(repr(link_dest_title) + '\n')
                models.links.save(models.Link(
                    src_id=aid,
                    dest_id=dest_id,
                    dest_fragment=dest_frag,
                    label=link_label,
                    snippet=snippet,
                    id=lid))
                self.num_links += 1

    # NOTE(review): the original nesting of the counter/progress code was not
    # recoverable from the source; it is assumed to run once per page in
    # every pass - confirm.
    self.num_articles += 1
    if self.num_articles % 1000 == 0:
        # Progress line: "<articles> <links>".
        print('%s %s' % (self.num_articles, self.num_links))
def _convert_options(self, question_index, options, input_type):
    """Render answer options as the lines of an HTML ``<form>``.

    Args:
        question_index: Value used for the ``name`` attribute of every
            ``<input>`` generated for this question.
        options: Sequence of option mappings; the text of each is read from
            ``option['display']['definition']['value']``.
        input_type: Value used for the ``type`` attribute of every
            ``<input>``.

    Returns:
        An empty list when ``options`` is falsy; otherwise a list that
        starts with ``'<form>'``, has one ``<label>`` line per option and
        ends with ``'</form>'``.
    """
    if not options:
        return []

    label_template = ('<label><input type="%s" name="%s">'
                      '%s<br></label>')

    def render(option):
        text = unescape_html(option['display']['definition']['value'])
        # <text> becomes <span> so that the answer text stays on the same
        # line as its checkbox/radio button.
        text = self._replace_tag(text, 'text', 'span')
        return label_template % (input_type, question_index, text)

    return ['<form>'] + [render(option) for option in options] + ['</form>']
def __call__(self, quiz_or_exam_json):
    """Convert a quiz/exam JSON structure into an HTML string.

    For every entry of ``quiz_or_exam_json['questions']`` the output gets,
    in order: a ``<h3>`` heading with the 1-based question number, the
    unescaped prompt text, the lines from ``self._generate_input_field()``
    when the question type is in ``self.KNOWN_INPUT_TYPES``, the lines from
    ``self._convert_options(...)`` and a closing ``<hr>``.

    Question types missing from ``self.KNOWN_QUESTION_TYPES`` are logged
    at INFO level and still rendered.

    Returns:
        All generated lines joined with newlines (``''`` when there are no
        questions).
    """
    # JSON question type -> HTML <input> type; anything else maps to ''.
    html_input_types = {
        'mcq': 'radio',
        'mcqReflect': 'radio',
        'checkbox': 'checkbox',
    }

    parts = []
    for index, question in enumerate(quiz_or_exam_json['questions']):
        kind = question['question']['type']
        if kind not in self.KNOWN_QUESTION_TYPES:
            logging.info('Unknown question type: %s', kind)
            logging.info('Question json: %s', question)
            logging.info('Please report class name, quiz name and the data'
                         ' above to coursera-dl authors')

        definition = question['variant']['definition']
        prompt = definition['prompt']
        options = definition.get('options', [])

        # Heading, then the question text itself.
        parts.append('<h3>Question %d</h3>' % (index + 1))
        parts.append(unescape_html(prompt['definition']['value']))

        # Free-form answer field, only for the known input types.
        if kind in self.KNOWN_INPUT_TYPES:
            parts.extend(self._generate_input_field())

        # Options are rendered as checkboxes or radio buttons.
        parts.extend(self._convert_options(
            index, options, html_input_types.get(kind, '')))

        parts.append('<hr>')

    return '\n'.join(parts)
def transform_body(body):
    """Extract parsed code snippets and hint tokens from ``<code>`` elements.

    ``body`` is split on ``"</code>"``; in every piece that contains
    ``"<code>"`` the text after the first ``"<code>"`` is unescaped with
    ``utils.unescape_html`` and classified:

    * text containing both ``"."`` and ``"("`` is a snippet to parse; it is
      additionally a hint when it is shorter than 25 characters and the
      piece contains no ``"<pre"`` (heuristic for an inline code block);
    * any other text shorter than 25 characters is a hint.

    Args:
        body: String to scan for ``<code>`` elements.

    Returns:
        A tuple ``(asts, code_hints)``: the truthy results of
        ``parse(snippet, resolve=True)`` in snippet order, and the set of
        tokens produced by ``utils.tokenize`` over all hints.
    """
    open_tag = "<code>"
    snippets = []
    hint_texts = []

    for piece in body.split("</code>"):
        if open_tag not in piece:
            continue
        code_text = utils.unescape_html(piece.split(open_tag, 1)[1])
        if "." in code_text and "(" in code_text:
            snippets.append(code_text)
            # Heuristic: no <pre> around it and short => inline code block.
            if "<pre" not in piece and len(code_text) < 25:
                hint_texts.append(code_text)
        elif len(code_text) < 25:
            hint_texts.append(code_text)

    code_hints = {
        token
        for hint_text in hint_texts
        for token in utils.tokenize(hint_text)
    }

    parsed = (parse(snippet, resolve=True) for snippet in snippets)
    asts = [tree for tree in parsed if tree]
    return asts, code_hints
ln += 1 currentline = self.file_content_lines[ln].strip() return ln def highlight_matched_terms(self, gitsearch_item_html): html_template = '<span class="hll">%s</span>' html = gitsearch_item_html for term in self.matched_terms: pattern = re.compile(r'\b%s\b' % term, re.IGNORECASE) #html = html.replace(term, html_template % term) html = pattern.sub(html_template % term, html) #print "term %s, replaced: %s" % (term, html_template % term) # Check if matched term is qualified and if it has already been something replaced if "." in term: for token in term.split("."): html = pattern.sub(html_template % token, html) return html if __name__ == '__main__': path = "/Users/Raphael/Downloads/GitArchive/linkedin_indextank-engine/indextank-engine/lucene-experimental/com/flaptor/org/apache/lucene/util/automaton/UTF32ToUTF8.java" matched_terms = [u'Integer.toBinaryString', u'Integer'] #i = GitSearchItem(path, matched_terms) file_content = read_file(path) print unescape_html( highlight(file_content, JavaLexer(), MyHtmlFormatter(linenos=True)))
def add_code_into_document(document, body):
    """Add the code-related fields extracted from ``body`` to ``document``.

    ``transform_body(body)`` supplies the parsed snippets and the code
    hints. For every parsed snippet the non-empty entries of
    ``typed_method_call``, ``extends``, ``used_classes``, ``methods``,
    ``methods_called`` and ``class_instance_creation`` are added as
    ``Field(..., Field.Store.YES, Field.Index.ANALYZED)``; ``comments``
    (when present) are unescaped and added with ``Field.Store.NO``; non-empty
    ``literals`` are added as ``StringField``. Finally every distinct hint
    token of length 2..19 is added as a ``code_hints`` field.

    Args:
        document: Object exposing ``add(field)``.
        body: Text handed to ``transform_body``.

    Returns:
        True when at least one non-empty ``typed_method_call``, ``methods``,
        ``methods_called`` or ``class_instance_creation`` value was added,
        otherwise False.
    """
    asts, code_hints = transform_body(body)

    def add_values(field_name, values):
        """Add each truthy value under ``field_name``; tell if any was added."""
        added = False
        for value in values:
            if value:
                document.add(Field(field_name, value,
                                   Field.Store.YES, Field.Index.ANALYZED))
                added = True
        return added

    has_code = False
    for tree in asts:
        has_code = add_values("typed_method_call",
                              tree["typed_method_call"]) or has_code
        # "extends" and "used_classes" never influence the return value.
        add_values("extends", tree["extends"])
        add_values("used_classes", tree["used_classes"])
        has_code = add_values("methods", tree["methods"]) or has_code
        has_code = add_values("methods_called",
                              tree["methods_called"]) or has_code

        # Comments are optional, added even when empty, and not stored.
        if "comments" in tree:
            for comment in tree["comments"]:
                document.add(Field("comments", utils.unescape_html(comment),
                                   Field.Store.NO, Field.Index.ANALYZED))

        has_code = add_values("class_instance_creation",
                              tree["class_instance_creation"]) or has_code

        for literal in tree["literals"]:
            if literal:
                document.add(StringField("literals", literal,
                                         Field.Store.YES))

    # Distinct hint tokens, keeping only lengths strictly between 1 and 20.
    hint_tokens = {
        token
        for hint in code_hints
        for token in utils.tokenize(hint)
        if 1 < len(token) < 20
    }
    for token in hint_tokens:
        document.add(Field("code_hints", token,
                           Field.Store.YES, Field.Index.ANALYZED))

    return has_code
def handleArticle(self, current_page):
    """Process one page; what happens depends on ``self.pass_num``.

    Pass 0 saves a ``models.Article`` built from the page's ``id``,
    ``title`` and ``revision_id``.

    Pass 1 resolves the page's article id, unescapes its text and then
    either

    * stores a ``models.Redirect`` on the article when
      ``wikitext.parse_redirect`` finds one, or
    * saves one ``models.Link`` per tuple yielded by
      ``wikitext.parse_links`` (links are only parsed when the page is
      not a redirect).

    A destination title that is not in ``models.articles`` gets
    ``dest_id=None`` and its ``repr`` is written to ``parse_log``.

    Output is written with ``parse_log.write`` and a single-argument
    ``print(...)`` so the method produces the same text on Python 2 and
    Python 3 (the former ``print >> f, x`` statements do not parse on 3).

    Args:
        current_page: Mapping with ``'id'``, ``'title'``, ``'revision_id'``
            (pass 0) and ``'title'``, ``'text'`` (pass 1).
    """
    if self.pass_num == 0:
        models.articles.save(models.Article(
            id=current_page['id'],
            title=current_page['title'],
            revision_id=current_page['revision_id']))
    elif self.pass_num == 1:
        aid = models.articles.resolve_title(current_page['title'])
        page_text = utils.unescape_html(current_page['text'])
        redirect = wikitext.parse_redirect(page_text)
        if redirect:
            redirect_dest_title, dest_frag, redirect_label = redirect
            article = models.articles[aid]
            if redirect_dest_title in models.articles:
                dest_id = models.articles.resolve_title(redirect_dest_title)
            else:
                # Broken redirect: keep it with no destination, log the title.
                dest_id = None
                parse_log.write(repr(redirect_dest_title) + '\n')
            article.redirect = models.Redirect(dest_id, dest_frag,
                                               redirect_label)
            models.articles.save(article)
        else:
            # Links are only parsed for non-redirect pages; the original
            # author notes parse_links does not work properly on redirects.
            for link_dest_title, dest_frag, link_label, snippet in \
                    wikitext.parse_links(page_text):
                lid = self.num_links + 1  # 0 is an invalid key
                if link_dest_title in models.articles:
                    dest_id = models.articles.resolve_title(link_dest_title)
                else:
                    # Broken link: keep it with no destination, log the title.
                    dest_id = None
                    parse_log.write(repr(link_dest_title) + '\n')
                models.links.save(models.Link(
                    src_id=aid,
                    dest_id=dest_id,
                    dest_fragment=dest_frag,
                    label=link_label,
                    snippet=snippet,
                    id=lid))
                self.num_links += 1

    # NOTE(review): the original nesting of the counter/progress code was not
    # recoverable from the source; it is assumed to run once per page in
    # every pass - confirm.
    self.num_articles += 1
    if self.num_articles % 1000 == 0:
        # Progress line: "<articles> <links>".
        print('%s %s' % (self.num_articles, self.num_links))