def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Banken text files and parse each volume
    to a JSON object written into out_dir.

    :param in_dir: directory containing the .txt volumes
    :param mappings: dict keyed by volume id -> {"AUTHOR", "TITLE", "PUBDATE"}
    :param out_dir: directory receiving one <id>.json file per volume
    """
    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):
            if txt_f[0] == ".":
                continue  # skip hidden files such as .DS_Store
            id_str = txt_f[:-4]  # strip the ".txt" extension
            obj = Parsed()
            try:
                # keep the try narrow: only the metadata lookup is
                # allowed to fail silently, not parsing/serialization
                maps = mappings[id_str]
                obj.a = maps["AUTHOR"]
                obj.t = maps["TITLE"]
                obj.y = maps["PUBDATE"]
            except KeyError:
                # no metadata for this volume -- skip it (best-effort,
                # matching the original behavior)
                continue
            # os.path.join works whether or not in_dir ends with '/'
            with open(os.path.join(in_dir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    add_content(line, obj, 'swedish')
            with open(os.path.join(out_dir, id_str + '.json'), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def parse_txt(in_dir, ids, out_dir):
    """
    Iterate over a directory of Gutenberg text files and parse each
    volume to a JSON object written into out_dir.

    Only text between the START/END "OF THIS PROJECT GUTENBERG EBOOK"
    markers is captured; author/title/year are looked up from the id on
    the 'Posting Date' line.

    :param in_dir: directory containing the .txt volumes
    :param ids: reference table consumed by match_pub_info
    :param out_dir: directory receiving one <name>.json file per volume
    """
    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):
            if txt_f[0] == ".":
                continue  # skip hidden files
            reading = False
            obj = Parsed()
            # os.path.join works whether or not in_dir ends with '/'
            with open(os.path.join(in_dir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    if 'Posting Date' in line:
                        idno = get_idno(line)
                        pub_info = match_pub_info(idno, ids)
                        obj.a, obj.t, obj.y = \
                            pub_info[1], pub_info[2], pub_info[3]
                    if 'START OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = True
                    if 'END OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = False
                    # capture body text, excluding the START marker line
                    if reading and \
                            'START OF THIS PROJECT GUTENBERG EBOOK' not in line:
                        add_content(line, obj, 'german')
            with open(os.path.join(out_dir, txt_f[:-4] + '.json'), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def _parse_files(self, doc, subdir):
    """
    Parse an individual XML volume *doc*, writing <doc>.json into
    self.output_dir.

    The file is looked up in self.input_dir first, falling back to
    *subdir* when it is not found there.
    """
    try:
        f = open("{0}/{1}".format(self.input_dir, doc), 'r')
    except FileNotFoundError:
        f = open("{0}/{1}".format(subdir, doc), 'r')
    # 'with' guarantees the handle is closed even if parsing or the
    # metadata lookup below raises (the original leaked it on error)
    with f:
        tree = BeautifulSoup(f.read(), 'xml')
    obj = Parsed()
    self.get_text(tree, obj)
    pub_info = self.mapping[doc[:-4]]  # metadata keyed by name sans ".xml"
    obj.a = pub_info["author"]
    obj.t = pub_info["title"]
    obj.y = pub_info["pub_date"]
    with open("{0}/{1}.json".format(self.output_dir, doc[:-4]), 'w',
              encoding='utf-8') as out:
        out.write(build_json(obj))
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Runeberg volume directories and parse
    each volume to a JSON object written into out_dir.

    Each volume directory contains a 'title' file (whose content is the
    metadata key) and a 'Pages/' subdirectory of per-page text files.

    :param in_dir: root directory of Runeberg volume directories
    :param mappings: dict keyed by title string -> {"AUTHOR", "TITLE", "PUBDATE"}
    :param out_dir: directory receiving the per-volume .json files
    """
    for _root, dirs, _files in os.walk(in_dir):
        for vol in tqdm.tqdm(dirs):
            if vol == "" or vol[0] == ".":
                continue  # skip hidden / empty directory names
            obj = Parsed()
            try:
                with open("{}/{}/title".format(in_dir, vol), 'r') as title_f:
                    id_str = title_f.read()
                maps = mappings[id_str]
            except (KeyError, FileNotFoundError):
                # no title file or no metadata entry -- skip the volume
                # (FileNotFoundError previously crashed the whole run)
                continue
            obj.a = maps["AUTHOR"]
            obj.t = maps["TITLE"]
            obj.y = maps["PUBDATE"]
            # fresh loop names: the original rebound subdir/dirs/files
            # here, shadowing the outer walk's variables
            pages_dir = "{}/{}/Pages/".format(in_dir, vol)
            for _pdir, _pdirs, page_files in os.walk(pages_dir):
                for text_f in page_files:
                    if text_f != "whole-page-ok.lst" and text_f[0] != ".":
                        with open("{}/{}/Pages/{}".format(in_dir, vol, text_f),
                                  'r') as txt_in:
                            for line in txt_in:
                                add_content(line, obj, 'swedish')
            # NOTE(review): vol[:-4] drops the last 4 chars of the
            # directory name -- confirm this matches the naming scheme
            with open(out_dir + vol[:-4] + '.json', 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def parse_threaded(xml_doc, input_doc, output_doc, csv_in):
    """
    Parse a single XML volume (one worker's unit of work) into a JSON
    file under output_doc.

    :param xml_doc: filename of the XML volume
    :param input_doc: input directory path (expected to end with '/')
    :param output_doc: output directory path (expected to end with '/')
    :param csv_in: CSV file mapping filenames to publication dates
    """
    refs = get_pub_dates(csv_in)
    tree = ET.parse(input_doc + xml_doc)
    root = tree.getroot()
    obj = Parsed()
    get_text(root, obj)
    text = "".join(obj.c)
    # only emit a JSON file for volumes that actually contain text
    if text != "":
        try:
            with open(output_doc + xml_doc[:-4] + '.json', 'w',
                      encoding='utf-8') as out:
                get_title_and_author(root, obj)
                get_publication_info(root, obj)
                get_isbn(root, obj)
                # assumes every xml_doc appears in the CSV; a missing
                # entry raises KeyError -- TODO confirm that is intended
                obj.y = refs[xml_doc]
                doc_type(root, obj)
                get_chapters(root, obj)
                out.write(build_json(obj))
        except IOError:
            # best-effort: skip volumes whose output cannot be written
            pass
def parse_files(in_dir, out_dir, htids, language):
    """
    Walk the leaf directories of in_dir, match each .xml file's
    HathiTrust id against *htids*, and build a JSON volume from the
    sibling .zip of page text files.

    :param in_dir: root directory of the HathiTrust tree
    :param htids: dict htid -> (author, title, year)
    :param out_dir: directory receiving one <htid>.json per matched volume
    :param language: language passed to add_content for stemming/filtering
    """
    for folder, subfolders, files in os.walk(in_dir):
        if subfolders:
            continue  # only leaf directories hold the xml/zip pairs
        for xml_file in files:
            if xml_file[-4:] != ".xml":
                continue
            # test if htid is in the set of htids; store it and build
            # the file if true
            htid_test = test_file_htid(htids, folder, xml_file)
            if not htid_test[0]:
                continue
            htid = htid_test[1]
            obj = Parsed()
            # replace periods for file-naming
            obj.h = htid.replace(".", "_")
            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                # best-effort: still build the volume, just without
                # metadata (original behavior)
                print("File with HTID {0} not found in CSV reference file."
                      .format(htid))
            for zip_file in files:
                if zip_file[-4:] == ".zip":
                    # os.path.join works whether or not folder ends with '/'
                    with zipfile.ZipFile(os.path.join(folder, zip_file),
                                         'r') as zf:
                        for txt_file in zf.namelist():
                            if txt_file[-4:] == ".txt":
                                text = zf.read(txt_file).decode('utf-8')
                                add_content(text, obj, language)
            with open(os.path.join(out_dir, str(obj.h) + ".json"), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def main():
    """
    Command-line entry point: parse a directory of XML volumes into
    JSON files, taking publication metadata from a CSV reference file.

    Flags: -i input directory, -o output directory, -csv metadata file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", metavar='in-directory', action="store",
                        help="input directory argument")
    parser.add_argument("-o", help="output directory argument",
                        action="store")
    parser.add_argument("-csv", help="csv file with publication dates",
                        action="store")
    try:
        args = parser.parse_args()
    except IOError:
        fail("IOError")
    build_out(args.o)
    if args.csv is not None:
        ids = parse_csv(args.csv)
    else:
        fail("Please specify input csv file path")
    for subdir, dirs, files in os.walk(args.i):
        for xmldoc in tqdm.tqdm(files):
            if xmldoc[0] == ".":
                continue  # skip hidden files
            # os.path.join works whether or not args.i ends with '/'
            tree = ET.parse(os.path.join(args.i, xmldoc))
            root = tree.getroot()
            base_url = get_id(root)
            obj = Parsed()
            get_text(root, obj)
            # only write volumes that produced some text content
            if len(obj.c) > 0:
                pub_info = get_pub_info(ids, base_url)
                obj.a, obj.t, obj.y = pub_info[0], pub_info[1], pub_info[2]
                with open(os.path.join(args.o, xmldoc[:-4] + '.json'), 'w',
                          encoding='utf-8') as out:
                    out.write(build_json(obj))
def build_json(file: Parsed):
    """
    Construct the JSON string that represents one volume in a corpus.

    Metadata fields the parser left unset are replaced with readable
    placeholder strings, newlines inside metadata are flattened to
    spaces, and the chapter list is passed through filter_chapters
    before serialization.
    """
    # fill in placeholder text for metadata that was never found
    placeholders = (
        ('t', "No title listed"),
        ('a', "No author listed"),
        ('p', "No publisher listed"),
        ('d', "No document type"),
        ('h', "No HTID for this file"),
    )
    for attr, fallback in placeholders:
        if getattr(file, attr) is None:
            setattr(file, attr, fallback)
    # the ISBN field defaults to the empty string rather than None
    if file.i == '':
        file.i = "No ISBN listed"
    # metadata strings must be single-line in the serialized output
    for attr in ('t', 'a', 'p', 'd'):
        setattr(file, attr, getattr(file, attr).replace("\n", " "))
    file.ch = filter_chapters(file.ch)
    payload = {
        'Title': file.t,
        'Author': file.a,
        'Publisher': file.p,
        'Date': file.y,
        'ISBN': file.i,
        'Document Type': file.d,
        'List of chapters': file.ch,
        'HTID': file.h,
        'Text': file.c,
        'Stemmed': file.cstem,
        'Filtered': file.tx,
        'Filtered Stemmed': file.txstem,
        'Full Sentences': file.c_sent,
        'Filtered Sentences': file.tx_sent,
        'Stemmed Sentences': file.cstem_sent,
        'Filtered Stemmed Sentences': file.txstem_sent,
        'URL': file.url
    }
    return json.dumps(payload, sort_keys=True, indent=4,
                      separators=(',', ': '), ensure_ascii=False)
def add_bs_xml_content(text: str, f: Parsed, lang: str):
    """
    Add content to a Parsed object from BeautifulSoup XML parser output.

    Records sentence-level and full-text forms (raw, stemmed, filtered,
    and filtered+stemmed) on *f*.
    """
    # sentence-level forms: split on sentence-ending punctuation
    for raw_sentence in re.split("[.!?]", text):
        cleaned = clean_text(raw_sentence)
        if len(cleaned) <= 1:
            continue  # ignore empty / one-token fragments
        f.add_content_sent(" ".join(cleaned))
        f.add_stemmed_sent(" ".join(stem_text(cleaned, lang)))
        filtered = filter_text(cleaned, lang)
        if len(filtered) > 1:
            f.add_filtered_sent(" ".join(filtered))
            f.add_filtered_stemmed_sent(" ".join(stem_text(filtered, lang)))
    # full-text forms
    all_tokens = clean_text(text)
    f.add_content(all_tokens)
    f.add_stemmed(stem_text(all_tokens, lang))
    all_filtered = filter_text(all_tokens, lang)
    f.add_filtered(all_filtered)
    f.add_filtered_stemmed(stem_text(all_filtered, lang))
def add_xml_content(root, file: Parsed, language: str):
    """
    Transform the text of an XML element (its .text plus .tail) into
    raw/filtered/stemmed forms and add them to a file object.

    :param root: ElementTree element whose .text/.tail are read
    :param file: Parsed object accumulating the volume's content
    :param language: language used for stemming and stop-word filtering
    """
    text = ''
    # compare against None directly: the old str(...) != 'None' check
    # wrongly skipped elements whose text is the literal string "None"
    if root.text is not None:
        text += root.text
    if root.tail is not None:
        text += ' ' + root.tail
    if text != '':
        # split after sentence-ending punctuation, keeping delimiters
        sentences = re.split('(?<=[.!?]) +', text)
        for sentence in sentences:
            sentence = clean_text(sentence)
            if len(sentence) > 1:
                file.add_content_sent(" ".join(sentence))
                sentence_stemmed = stem_text(sentence, language)
                file.add_stemmed_sent(" ".join(sentence_stemmed))
                sentence_filtered = filter_text(sentence, language)
                if len(sentence_filtered) > 1:
                    file.add_filtered_sent(" ".join(sentence_filtered))
                    sentence_filtered_stemmed = stem_text(
                        sentence_filtered, language)
                    file.add_filtered_stemmed_sent(
                        " ".join(sentence_filtered_stemmed))
        text_list = clean_text(text)  # full text
        file.add_content(text_list)
        # stem the full text
        stemmed = stem_text(text_list, language)
        file.add_stemmed(stemmed)
        # filter the unstemmed full text
        filtered = filter_text(text_list, language)
        file.add_filtered(filtered)
        # stem the filtered text
        filtered_stemmed = stem_text(filtered, language)
        file.add_filtered_stemmed(filtered_stemmed)
def add_content(text: str, file: Parsed, language: str):
    """
    Transform *text* into raw/filtered/stemmed forms, at both sentence
    and full-text granularity, and add them to *file*.
    """
    # sentence-level forms: split after sentence-ending punctuation
    for raw in re.split('(?<=[.!?]) +', text):
        tokens = clean_text(raw)
        if len(tokens) <= 1:
            continue  # ignore empty / one-token fragments
        file.add_content_sent(" ".join(tokens))
        file.add_stemmed_sent(" ".join(stem_text(tokens, language)))
        filtered_tokens = filter_text(tokens, language)
        if len(filtered_tokens) > 1:
            file.add_filtered_sent(" ".join(filtered_tokens))
            file.add_filtered_stemmed_sent(
                " ".join(stem_text(filtered_tokens, language)))
    # full-text forms
    all_tokens = clean_text(text)
    file.add_content(all_tokens)
    file.add_stemmed(stem_text(all_tokens, language))
    all_filtered = filter_text(all_tokens, language)
    file.add_filtered(all_filtered)
    file.add_filtered_stemmed(stem_text(all_filtered, language))