示例#1
0
def run_parse(files, doctype='grant'):
    """Configure file logging, then parse ``files`` via ``parse.parse_files``.

    Args:
        files: forwarded unchanged as the first argument of
            ``parse.parse_files``.
        doctype: forwarded unchanged as the second argument of
            ``parse.parse_files``; defaults to ``'grant'``.

    Returns:
        None. The value returned by ``parse.parse_files`` is discarded.
    """
    import logging

    import parse
    # Not referenced by name below. Kept because importing a project module
    # may have side effects this function depends on -- TODO confirm, then
    # drop if it has none.
    import lib.alchemy as alchemy

    # DEBUG-level records go to xml-parsing.log in the current working
    # directory. basicConfig is a no-op if the root logger already has
    # handlers configured.
    logging.basicConfig(filename='./xml-parsing.log', level=logging.DEBUG)
    parse.parse_files(files, doctype)
示例#2
0
def run_parse(files, doctype='grant'):
    """Configure file logging, then parse ``files`` via ``parse.parse_files``.

    Args:
        files: forwarded unchanged as the first argument of
            ``parse.parse_files``.
        doctype: forwarded unchanged as the second argument of
            ``parse.parse_files``; defaults to ``'grant'``.

    Returns:
        None. The value returned by ``parse.parse_files`` is discarded.
    """
    import logging

    import parse
    # Not referenced by name below. Kept because importing a project module
    # may have side effects this function depends on -- TODO confirm, then
    # drop if it has none.
    import lib.alchemy as alchemy

    # DEBUG-level records go to xml-parsing.log in the current working
    # directory. basicConfig is a no-op if the root logger already has
    # handlers configured.
    logging.basicConfig(filename='./xml-parsing.log', level=logging.DEBUG)
    parse.parse_files(files, doctype)
示例#3
0
def save_rust_repo(srcpath, **kwargs):
    """Save rust and formats to database.

    Two passes are made over files found under ``srcpath``:

    1. ``sourcefiles.txt`` is read line by line; each stripped line is joined
       onto ``srcpath`` and the resulting paths are parsed together. The docs
       extracted from that tree are passed to ``insert_docs``.
    2. ``formats.rst`` is parsed on its own; the formats extracted from that
       tree are passed to ``insert_formats``.

    Args:
        srcpath: directory containing ``sourcefiles.txt`` and ``formats.rst``.
        **kwargs: forwarded unchanged to both ``insert_docs`` and
            ``insert_formats``.

    Returns:
        None. A count of inserted documents and formats is printed.
    """
    import parse

    listing_path = os.path.join(srcpath, 'sourcefiles.txt')
    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # text mode already uses universal newlines on Python 3. On Python 2 any
    # trailing '\r' left by CRLF files is removed by strip() below.
    with open(listing_path, 'r') as ifile:
        srcfiles = [os.path.join(srcpath, s.strip()) for s in ifile]
    tree = parse.parse_files(srcfiles)
    docs = extract_docs(tree)
    insert_docs(docs, **kwargs)
    # Parenthesised single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3.
    print('inserted %d documents' % len(docs))

    formats_path = os.path.join(srcpath, 'formats.rst')
    tree = parse.parse_files([formats_path])
    formats = extract_formats(tree)
    insert_formats(formats, **kwargs)
    print('inserted %d formats' % len(formats))
 def test_parse_patent(self):
     """parse_patents yields len(xmlclasses) results per parsed grant.

     Uses the files under ``<basedir>/fixtures/xml`` selected by the
     ``ipg120327.one.xml`` pattern.
     """
     testdir = os.path.join(basedir, './fixtures/xml')
     xmlregex = r'ipg120327.one.xml'
     filelist = parse.list_files(testdir, xmlregex)
     grant_list = list(parse.parse_files(filelist))
     parsed_grants = list(parse.parse_patents(grant_list))
     # assertEqual reports both counts on failure; assertTrue(a == b) would
     # only say "False is not true".
     self.assertEqual(len(parsed_grants),
                      len(grant_list) * len(xmlclasses))
    def __init__(self, name, output_path):
        """Collect the ``<name>_*.hxx`` headers under ``oce_include`` and parse them.

        Args:
            name: header prefix; files matching ``<oce_include>/<name>_*.hxx``
                are considered.
            output_path: stored as ``self.output_path``; not otherwise used
                here.

        Sets ``self.files`` to the matching paths for which ``ignore(path)``
        is falsy, and ``self.ns`` to the result of
        ``parse.parse_files(oce_include, self.files)``.
        """
        self.name = name
        self.output_path = output_path
        candidates = glob(oce_include + "/" + name + "_*.hxx")
        # A list comprehension rather than filter(): on Python 3 filter()
        # returns a one-shot iterator, which would leave self.files exhausted
        # once parse_files has consumed it. On Python 2 the result is the
        # same list as before.
        self.files = [h for h in candidates if not ignore(h)]

        self.ns = parse.parse_files(oce_include, self.files)
 def test_use_parse_files_one(self):
     """Every class in ``xmlclasses`` builds a truthy object from the first grant.

     The grant text is element 1 of the first item produced by
     ``parse.parse_files`` for ``testdir + testfileone``.
     """
     filelist = [testdir+testfileone]
     parsed_output = list(parse.parse_files(filelist))
     patobj = PatentGrant(parsed_output[0][1], True)
     parsed_xml = [xmlclass(patobj) for xmlclass in xmlclasses]
     # assertEqual shows both lengths when the check fails.
     self.assertEqual(len(parsed_xml), len(xmlclasses))
     self.assertTrue(all(parsed_xml))
 def test_parse_files_one(self):
     """parse_files on ``testfileone`` yields exactly one tuple with a str at index 1.

     Also checks that the raw return value is an ``Iterable`` and that the
     string at index 1 is matched by the module-level ``regex``.
     """
     filelist = [testdir+testfileone]
     parsed_output = parse.parse_files(filelist)
     # The dedicated assert methods report the offending value/type on
     # failure instead of a bare "False is not true".
     self.assertIsInstance(parsed_output, Iterable)
     parsed_output = list(parsed_output)
     self.assertEqual(len(parsed_output), 1)
     self.assertIsInstance(parsed_output[0], tuple)
     self.assertIsInstance(parsed_output[0][1], str)
     self.assertTrue(regex.match(parsed_output[0][1]))
示例#8
0
def run_parse():
    """Run the parse pipeline on ``files`` and return the generated inserts.

    ``files`` is not a parameter: it is looked up in the enclosing scope, so
    it must be defined there before this function is called.

    The steps are ``parse.parse_files`` -> ``parse.parse_patents`` ->
    ``parse.build_tables``.

    Returns:
        Whatever ``parse.get_inserts()`` returns once ``build_tables`` has
        been given the parsed grants.
    """
    import parse

    parsed_xmls = parse.parse_files(files)
    parsed_grants = parse.parse_patents(parsed_xmls)
    parse.build_tables(parsed_grants)
    return parse.get_inserts()
 def test_use_parse_files_two(self):
     """Each item parsed from ``testfiletwo`` builds a truthy v4.2 PatentGrant.

     Every item produced by ``parse.parse_files`` must be a tuple whose
     element 1 is a ``str``; that string is handed to
     ``grant_handler_v42.PatentGrant``.
     """
     filelist = [testdir+testfiletwo]
     parsed_output = parse.parse_files(filelist)
     for us_patent_grant in parsed_output:
         # assertIsInstance names the actual type when the check fails.
         self.assertIsInstance(us_patent_grant, tuple)
         self.assertIsInstance(us_patent_grant[1], str)
         patobj = grant_handler_v42.PatentGrant(us_patent_grant[1], True)
         self.assertTrue(patobj)
 def test_use_parse_files_two(self):
     """Each item parsed from ``testfiletwo`` builds a truthy v4.2 PatentGrant.

     Every item produced by ``parse.parse_files`` must be a tuple whose
     element 1 is a ``str``; that string is handed to
     ``grant_handler_v42.PatentGrant``.
     """
     filelist = [testdir + testfiletwo]
     parsed_output = parse.parse_files(filelist)
     for us_patent_grant in parsed_output:
         # assertIsInstance names the actual type when the check fails.
         self.assertIsInstance(us_patent_grant, tuple)
         self.assertIsInstance(us_patent_grant[1], str)
         patobj = grant_handler_v42.PatentGrant(us_patent_grant[1], True)
         self.assertTrue(patobj)
 def test_parse_files_one(self):
     """parse_files on ``testfileone`` yields exactly one tuple with a str at index 1.

     Also checks that the raw return value is an ``Iterable`` and that the
     string at index 1 is matched by the module-level ``regex``.
     """
     filelist = [testdir + testfileone]
     parsed_output = parse.parse_files(filelist)
     # The dedicated assert methods report the offending value/type on
     # failure instead of a bare "False is not true".
     self.assertIsInstance(parsed_output, Iterable)
     parsed_output = list(parsed_output)
     self.assertEqual(len(parsed_output), 1)
     self.assertIsInstance(parsed_output[0], tuple)
     self.assertIsInstance(parsed_output[0][1], str)
     self.assertTrue(regex.match(parsed_output[0][1]))
 def test_use_parse_files_two(self):
     """Every xmlclass builds a truthy object for each grant in ``testfiletwo``.

     Each item produced by ``parse.parse_files`` must be a tuple whose
     element 1 is a ``str``. The test expects ``2 * len(xmlclasses)``
     objects in total, i.e. two items from the parsed file.
     """
     filelist = [testdir+testfiletwo]
     parsed_output = parse.parse_files(filelist)
     parsed_xml = []
     for us_patent_grant in parsed_output:
         # assertIsInstance names the actual type when the check fails.
         self.assertIsInstance(us_patent_grant, tuple)
         self.assertIsInstance(us_patent_grant[1], str)
         patobj = PatentGrant(us_patent_grant[1], True)
         parsed_xml.extend(xmlclass(patobj) for xmlclass in xmlclasses)
     # assertEqual shows both counts when the check fails.
     self.assertEqual(len(parsed_xml), 2 * len(xmlclasses))
     self.assertTrue(all(parsed_xml))
示例#13
0
def scrape(path):
    """Gather every ``.py`` file below ``path`` and pass the list to ``parse_files``.

    Directories are visited iteratively with an explicit stack, so the most
    recently discovered directory is listed next. Any entry that is a
    symbolic link is skipped, whether it points to a file or a directory.

    Returns the result of ``parse_files`` called with the collected paths.
    """
    found = []
    pending = [path]

    while pending:
        current = pending.pop()
        for entry in os.listdir(current):
            candidate = os.path.join(current, entry)
            if os.path.islink(candidate):
                # Links are never followed or collected.
                continue
            if os.path.isdir(candidate):
                pending.append(candidate)
                continue
            if os.path.isfile(candidate) and candidate.endswith('.py'):
                found.append(candidate)

    return parse_files(found)
示例#14
0
def save_concept_lessons_csv(ctfiles, csvfile, func=flag_rst_images,
                             blocks=(':warning:', ':comment:', ':derivation:',
                                     ':intro:', ':informal-definition:',
                                     ':formal-definition:')):
    """Parse ``ctfiles`` and write the resulting lesson tree to ``csvfile`` as CSV.

    The tree returned by ``parse.parse_files(ctfiles)`` is walked twice:
    first to give every node a sequential ``nodeID``, then to emit rows.
    Before the walk, one entry per item in ``defaultErrorModels`` is written
    via ``save_generic_error``.

    Args:
        ctfiles: passed unchanged to ``parse.parse_files``.
        csvfile: path opened for writing with UTF-8 encoding.
        func: forwarded unchanged to the ``save_*`` helpers; defaults to
            ``flag_rst_images``.
        blocks: first-token values that are saved through
            ``save_section_csv`` with an explicit kind and concept ID.

    Returns:
        None. All output goes to ``csvfile``.
    """
    tree = parse.parse_files(ctfiles)
    with codecs.open(csvfile, 'w', encoding='utf-8') as ofile:
        writer = csv.writer(ofile)
        for error in defaultErrorModels:
            save_generic_error(error, writer)
        for i, lesson in enumerate(tree.walk()): # assign node IDs
            lesson.nodeID = i
        for lesson in tree.walk():
            # Nodes without a ``tokens`` attribute produce no rows at all.
            if not hasattr(lesson, 'tokens'):
                continue
            metadata = lesson.metadata_dict()
            # Error-like nodes are saved through save_concept_error and skip
            # the concept-link loop at the bottom of this iteration.
            if 'fallacy' in metadata.get('conceptType', ()) \
              or 'violates' in metadata:
                save_concept_error(lesson, metadata, writer, func)
                continue # do not generate concept links
            elif lesson.tokens[0] == 'section':
                save_section_csv(lesson, metadata, writer, func)
            elif lesson.tokens[0] in blocks:
                # [1:-1] drops the first and last character of the token
                # (the surrounding colons for the default block names).
                if len(lesson.tokens) >= 2:
                    # Concept ID given explicitly as the second token.
                    save_section_csv(lesson, metadata, writer, func,
                                    lesson.tokens[0][1:-1], lesson.tokens[1])
                elif lesson.parent and getattr(lesson.parent, 'conceptID', 0):
                    # Otherwise fall back to the parent's conceptID, which is
                    # only set by the 'defines' branch below; if neither
                    # source exists the block is not saved.
                    save_section_csv(lesson, metadata, writer, func,
                                     lesson.tokens[0][1:-1],
                                     lesson.parent.conceptID)
            elif lesson.tokens[0] == ':question:':
                if is_multipart_question(lesson):
                    save_section_csv(lesson, metadata, writer, func)
                else:
                    save_question_csv2(lesson, writer, func)
            # Reached for every node that was not handled as an error above,
            # including nodes that matched none of the branches.
            for relation, conceptID in get_concept_links(metadata):
                if relation == 'defines':
                    # Remembered on the node so later nodes can read it via
                    # ``lesson.parent.conceptID``.
                    lesson.conceptID = conceptID
                writer.writerow(('conceptlink', lesson.nodeID, relation,
                                 conceptID))
示例#15
0
 def test_use_parse_files_one(self):
     """The first item parsed from ``testfileone`` builds a truthy v4.2 PatentGrant."""
     grants = list(parse.parse_files([testdir+testfileone]))
     first_grant = grants[0]
     # Element 1 of the parsed item is what PatentGrant is constructed from.
     grant_obj = grant_handler_v42.PatentGrant(first_grant[1], True)
     self.assertTrue(grant_obj)
 def test_use_parse_files_one(self):
     """The first item parsed from ``testfileone`` builds a truthy v4.2 PatentGrant."""
     grants = list(parse.parse_files([testdir + testfileone]))
     first_grant = grants[0]
     # Element 1 of the parsed item is what PatentGrant is constructed from.
     grant_obj = grant_handler_v42.PatentGrant(first_grant[1], True)
     self.assertTrue(grant_obj)