Exemplo n.º 1
0
def cleaning():
    """Run the data-cleaning workspace and save before/after snapshots."""
    print('\n\n\ncleaning...')
    workspace = Cleaner(list_of_paths_to_process)
    workspace.show_basic_info()
    # Snapshot the raw frame, drop redundancy, snapshot the result.
    workspace.save_df_as_image('output_images/data_as_is.png')
    workspace.drop_redundant_data()
    workspace.save_df_as_image('output_images/cleared_data.png')
Exemplo n.º 2
0
def pre_process():
    """Clean, feature-engineer and target-encode the train/test frames.

    Returns the processed (train_df, test_df) pair.
    """
    cleaner = Cleaner()
    feature_engineer = FeatureEngineer()

    # Cleaning
    train_df, test_df, magic_df = load_data(DATA_DIR)
    train_df = train_df.iloc[:300]
    test_df = test_df.iloc[:100]
    n_train = len(train_df)
    train_df = cleaner.train(train_df)
    test_df = cleaner.test(test_df)
    magic_df = cleaner.magic(magic_df)
    logger.info('Finished cleaning...')

    # Feature Engineering: concat train+test so features are computed
    # consistently across both, then merge in the "magic" frame.
    combined = pd.concat([train_df, test_df], sort=False)
    combined = pd.merge(combined, magic_df, on='listing_id')
    for step in (feature_engineer.basic,
                 feature_engineer.manager_id,
                 feature_engineer.location):
        combined = step(combined)
    logger.info('Finished feature engineering...')

    # Target Encoding: split back on the remembered train length.
    train_df = combined.iloc[:n_train]
    test_df = combined.iloc[n_train:]
    train_df, test_df = target_encoder(train_df, test_df)
    logger.info('Finished target encoding...')

    return train_df, test_df
Exemplo n.º 3
0
    def __init__(self, a, dest, c=1):
        """Write a Prospero .txt/.ctx file pair for article *a* into *dest*.

        Parameters
        ----------
        a : dict
            Article record; keys used: 'source', 'date', 'title',
            'subtitle', 'text'.
        dest : str
            Destination directory for the generated files.
        c : int, optional
            When truthy (default), pass text and context through Cleaner
            before encoding.
        """
        self.dest = dest
        s = Publi()
        # Fall back to generic labels when the source is not in the codex.
        if a['source'] not in s.codex:
            prefix = "EUROPRESSE"
            source = a['source']
            source_type = "unknown source"
        else:
            prefix = s.codex[a['source']]['abr']
            source = s.codex[a['source']]['source']
            source_type = s.codex[a['source']]['type']

        self.filename = self.file_name(a['date'], prefix)

        # Body: title, optional subtitle, then the article text; each part
        # is terminated with the Prospero "\r\n.\r\n" sentence separator.
        text = a['title'] + "\r\n.\r\n"
        text += a['subtitle'] + "\r\n.\r\n" if a['subtitle'] else ""
        text += a['text']

        ctx = [
            "fileCtx0005",
            a['title'],
            source,
            "", "",
            a['date'],
            source,
            source_type,
            "", "", "",
            "Processed by Tiresias on %s"
                % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "", "n", "n", ""
            ]
        ctx = "\r\n".join(ctx)

        if c:
            # Clean, then encode to latin-1, replacing unmappable
            # characters with XML character references.
            cl_txt = Cleaner(text.encode('utf-8'))
            text = cl_txt.content.encode('latin-1',
                                         'xmlcharrefreplace')  #to bytes
            cl_ctx = Cleaner(ctx.encode('utf-8'))
            ctx = cl_ctx.content.encode('latin-1',
                                        'xmlcharrefreplace')  #to bytes
        else:
            ctx = ctx.encode('latin-1', 'xmlcharrefreplace')  #to bytes
            text = text.encode('latin-1', 'xmlcharrefreplace')  #to bytes

        path = os.path.join(self.dest, self.filename + ".txt")
        with open(path, 'wb') as f:
            f.write(text)

        path = os.path.join(self.dest, self.filename + ".ctx")
        with open(path, 'wb') as f:
            f.write(ctx)
Exemplo n.º 4
0
    def write_prospero_files(self, save_dir=".", cleaning=False):
        """For each article, write a .txt and a .ctx file into *save_dir*."""
        for art in self.articles.values():
            base = file_name(art['date'], art['root'], save_dir)

            # Prepend the title as its own Prospero "sentence".
            art['text'] = art['title'] + "\r\n.\r\n" + art['text']
            if cleaning:
                body = Cleaner(art['text'].encode('utf-8')).content
            else:
                body = art['text']
            txt_path = os.path.join(save_dir, base + ".txt")
            with open(txt_path, 'wb') as out:
                #to bytes
                out.write(body.encode('latin-1', 'xmlcharrefreplace'))

            stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            ctx_lines = [
                "fileCtx0005",
                art['title'],
                art['support'],
                "", "",
                art['date'],
                "",
                art['source_type'],
                "", "", "",
                "Processed by Tiresias on %s" % stamp,
                "", "n", "n", ""
                ]
            ctx_path = os.path.join(save_dir, base + ".ctx")
            with open(ctx_path, 'wb') as out:
                #to bytes
                out.write("\r\n".join(ctx_lines).encode('latin-1',
                                                        'xmlcharrefreplace'))
Exemplo n.º 5
0
def write_txt(path, text):
    """Clean *text* and write it to *path* as latin-1 bytes.

    Unmappable characters are replaced with XML character references.
    """
    cleaned = Cleaner(text.encode('utf-8')).content
    with open(path, 'wb') as out:
        out.write(cleaned.encode('latin-1', 'xmlcharrefreplace'))  #to bytes
Exemplo n.º 6
0
    def __init__(self, url):
        """Scrape one "Transitions & Energies" article at *url* and write a
        Prospero .txt/.ctx file pair into the hard-coded corpus directory.
        """
        # NOTE(review): destination is hard-coded — consider parameterizing.
        dest = "C:\\corpus\\EnergiCorpus\\FR\\TEE\\"
        with urllib.request.urlopen(url) as page:
            soup = BeautifulSoup(page, "lxml")
        title = soup.title.string
        author = soup.find("div", "meta-author").text
        date = soup.find("div", "meta-date").text
        # Strip the site suffix the page appends to every <title>.
        title = re.sub(" - Transitions & Energies", "", title)
        print(title, author, date)
        # The title becomes the first Prospero "sentence" of the body.
        content = title +  "\r\n.\r\n\r\n"
        article = soup.find('article')
        # Headings (h2) get their own sentence separator; paragraph text is
        # appended verbatim.
        for el in article.find_all(['h2', 'p']):
            if el.name == "h2":
                content += "\r\n\r\n" + el.text  + "\r\n.\r\n"
            else:
                content += el.text

        date = formate_date(date)
        ctx = formate_ctx(title, date, url)

        # Clean, then encode to latin-1 with XML character references for
        # anything latin-1 cannot represent.
        ctx_cleaner =  Cleaner(ctx.encode('utf-8'))
        ctx = ctx_cleaner.content.encode('latin-1', 'xmlcharrefreplace') #to bytes

        text_cleaner =  Cleaner(content.encode('utf-8'))
        text = text_cleaner.content.encode('latin-1', 'xmlcharrefreplace') #to bytes


        # NOTE(review): argument order differs from file_name() usage in the
        # other examples — confirm against the helper's actual signature.
        filename = file_name(dest, date, "TEE")

        path = os.path.join(dest, filename + ".txt")
        with open(path, 'wb') as f:
            f.write(text)

        path = os.path.join(dest, filename + ".ctx")
        with open(path, 'wb') as f:
            f.write(ctx)
Exemplo n.º 7
0
def create_ctx(path, metadata):
    """Build a Prospero .ctx file for a book chapter and write it to *path*.

    Parameters
    ----------
    path : str
        Output file path (written in binary mode).
    metadata : dict
        Keys used: 'title', 'authors', 'date' (year string), 'ref'.
    """
    ctx = [
        "fileCtx0005",
        metadata['title'],
        metadata['authors'],
        "",
        "",
        # Prospero needs a full date; pin chapters to Jan 1st of the year.
        "01/01/" + metadata['date'],
        "",
        "chapitre",
        metadata['ref'],
        "",
        "",
        "Processed by Tiresias on %s"
        % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "", "n", "n", ""
        ]
    ctx = "\r\n".join(ctx)
    ctx_cleaner = Cleaner(ctx.encode('utf-8'))
    # Fixed: original had a duplicated "ctx = ctx =" assignment.
    # Encode to latin-1, replacing unmappable chars with XML references.
    ctx = ctx_cleaner.content.encode('latin-1', 'xmlcharrefreplace')  #to bytes
    with open(path, 'wb') as file:
        file.write(ctx)
Exemplo n.º 8
0
def ctx_prospero(
    csvfile,
    save_dir=".",
    cleaning=False,
    brackets=False,
    author_keywords=False,
    index_keywords=False,
    rm_copyright=False,
):
    """Convert a Scopus CSV export into Prospero .txt/.ctx file pairs.

    Parameters
    ----------
    csvfile : file-like
        Open CSV with columns 'Link', 'Authors', 'Title', 'Year',
        'Abstract', 'Author Keywords', 'Index Keywords'.
    save_dir : str
        Directory the files are written to (default: current directory).
    cleaning : bool
        Pass the text through Cleaner before encoding.
    brackets : bool
        Drop a trailing bracketed translation from the title.
    author_keywords, index_keywords : bool
        Append the corresponding keyword fields to the text body.
    rm_copyright : bool
        Strip the trailing copyright notice from the abstract.

    Returns
    -------
    tuple[int, int]
        (files written, rows skipped because they had no abstract)
    """
    reader = csv.DictReader(csvfile, delimiter=",")
    papers = {}
    file_count = 0
    no_abstract = 0
    for row in reader:
        # The Scopus EID is embedded in the link's query string.
        eid = re.search(r'eid=([^\&]*)\&', row['Link']).group(1)
        if row['Abstract'] == "[No abstract available]":
            no_abstract += 1
        else:
            papers[eid] = [
                row['Authors'], row['Title'], row['Year'], row['Abstract'],
                row['Author Keywords'], row['Index Keywords']
            ]

    for eid, paper in papers.items():
        authors, title, year, abstract, a_kw, i_kw = paper

        # Remove the translations between [] at the end of the title.
        if brackets:
            title = re.sub(r"\[.*\]$", "", title)

        # Title first, as its own Prospero "sentence".
        txt_content = title + "\r\n.\r\n"

        if author_keywords and a_kw:
            txt_content += a_kw + "\r\n.\r\n"

        if index_keywords and i_kw:
            txt_content += i_kw + "\r\n.\r\n"

        # Strip the trailing "© <year> ..." / "Copyright <year> ..." notice.
        # Fixed: regex is now a raw string (the old "\d" escape raises a
        # SyntaxWarning on modern Python).
        if rm_copyright:
            abstract = re.sub(r" (©|Copyright),? \d{4},? .*$", "", abstract)

        txt_content += abstract

        if cleaning:
            txt_content = Cleaner(txt_content.encode('utf-8')).content
        txt_content = txt_content.encode('latin-1',
                                         'xmlcharrefreplace')  #to bytes
        filename = os.path.join(save_dir, eid)
        with open("%s.txt" % filename, 'wb') as txtfile:
            txtfile.write(txt_content)
            file_count += 1

        ctx = "\r\n".join([
            "fileCtx0005",
            title,
            authors,
            "", "",
            "01/01/%s" % year,
            "",
            "",
            "", "", "",
            "From Scopus by Tiresias on %s"
                % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "", "n", "n", ""
            ])
        with open("%s.ctx" % filename, 'wb') as ctxfile:
            ctxfile.write(ctx.encode('latin-1', 'xmlcharrefreplace'))  #to bytes
            file_count += 1

    return file_count, no_abstract
Exemplo n.º 9
0
from cleaning import Cleaner
from Scrapper import Scrapper
import pandas as pd
if __name__ == "__main__":
    cleaner = Cleaner("../cache")
    print("Enter 1800<year<1900")

    year = int(input())
    # Bounds are inclusive despite the strict-inequality prompt (kept as-is).
    if 1800 <= year <= 1900:
        print("=======> " + str(year))
        arks = Scrapper.get_arks(year)
        for ark in arks:
            print("=======>" + ark)
            print(f"- download {ark}")
            document = Scrapper.get_document(ark)
            # Fixed: this message was missing the f-prefix, so "{ark}" was
            # printed literally instead of the document identifier.
            print(f"- Extraction {ark}")
            df = cleaner.extract(document)
            print(f"{df.shape[0]} rows detected")
            print(f"- Post processing {ark}")
            df = cleaner.postProcess(df)
            print("- Spell checking")
            df = cleaner.spell_check(df)
            print("- saving ")
            cleaner.save(df, ark)
            # Fixed typo: "finnished" -> "finished".
            print(" finished " + ark)
            print("\n")
            # Release large objects before the next download to bound memory.
            del document
            del df