Example #1
def process_french(df, fr):
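    # Join the English and French condition data on an instrument + condition
    # id, build replacement maps for project name, theme and location, and
    # apply them to df.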
    df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    en = df.copy()
    en = en[en['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    fr = fr.rename(
        columns={
            "Société": "Company",
            "Nom du projet": "Project Name",
            "Nom du projet court": "Short Project Name",
            "État du projet": "Project Status",
            "Instrument no": "Instrument Number",
            "Activité liée à l'instrument": "Instrument Activity",
            "Entrée en vigueur": "Effective Date",
            "Date de délivrance": "Issuance Date",
            "Date de réexamen": "Sunset Date",
            "État de l'instrument": "Instrument Status",
            "Lieu": "Location",
            "Condition No": "Condition Number",
            "Condition": "Condition",
            "État de condition": "Condition Status",
            "Étape de condition": "Condition Phase",
            "Type de Condition": "Condition Type",
            "Dépôt pour condition": "Condition Filing",
            "Thème(s)": "Theme(s)"
        })
    fr = fr[fr['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    fr['Company'] = fr['Company'].replace(company_rename())

    en = normalize_text(en, [
        'Location', 'Short Project Name', 'Theme(s)', 'Condition Number',
        'Instrument Number'
    ])
    fr = normalize_text(fr, [
        'Location', 'Short Project Name', 'Theme(s)', 'Condition Number',
        'Instrument Number'
    ])
    fr['french id'] = [
        str(ins).strip() + '_' + str(cond).strip()
        for ins, cond in zip(fr['Instrument Number'], fr['Condition Number'])
    ]
    en['english id'] = [
        str(ins).strip() + '_' + str(cond).strip()
        for ins, cond in zip(en['Instrument Number'], en['Condition Number'])
    ]
    fr = fr[['french id', 'Location', 'Short Project Name',
             'Theme(s)']].copy().reset_index(drop=True)
    join = en.merge(fr,
                    how='inner',
                    left_on='english id',
                    right_on='french id',
                    suffixes=('_en', '_fr'))
    projectReplace = frenchSubsets(join, 'Short Project Name')
    themeReplace = frenchSubsets(join, 'Theme(s)')
    locationReplace = frenchSubsets(join, 'Location')
    df['Location'] = df['Location'].replace(locationReplace)
    df['Theme(s)'] = df['Theme(s)'].replace(themeReplace)
    df['Short Project Name'] = df['Short Project Name'].replace(projectReplace)
    return df
Example #2
def get_data(test, sql=False, query='throughput_gas_monthly.sql'):
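    # Read the query result from SQL (caching it to CSV), from the test data
    # folder, or from a previously saved local CSV; key point data gets extra
    # rows and cleanup below.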

    csvName = query.split(".")[0]+'.csv'
    if sql:
        print('reading sql '+query.split(".")[0])
        df = execute_sql(path=os.path.join(script_dir, "queries"),
                         query_name=query, db='EnergyData')
        df.to_csv('raw_data/'+csvName, index=False)
    elif test:
        print('reading test '+query.split(".")[0])
        df = pd.read_csv('raw_data/test_data/'+csvName)
    else:
        print('reading local '+query.split(".")[0])
        df = pd.read_csv('raw_data/'+csvName, encoding='latin-1')

    # initial processing for key points
    if query == 'key_points.sql':
        # add extra key points that don't appear in the database
        new = range(5)
        others = pd.DataFrame.from_dict({"Key Point": ["Calgary", "Edmonton", "Saturn", "OSDA Kirby", "OSDA Liege"],
                                         "Corporate Entity": ["NOVA Gas Transmission Ltd." for x in new],
                                         "Latitude": [51.22022, 51.80478, 55.99558, 53.31907, 56.9473],
                                         "Longitude": [-114.4925, -113.59329, -121.1104, -111.35386, -111.80979]})
        df = fixKeyPoint(df)
        df = df.append(others, ignore_index=True)
        df = normalize_text(df, ['Key Point', 'Corporate Entity'])
        df = normalize_numeric(df, ['Latitude', 'Longitude'], 3)
        df = fixCorporateEntity(df)
        df = df[df['Key Point'] != "FortisBC Lower Mainland"]

    return df
Example #3
def modify_validation_questions(qa_file, synonyms_file, outfile):
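    # Apply synonym replacements to the validation-split questions, report how
    # many questions changed, and write the modified set to a JSON file.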
    synonyms = Synonyms(synonyms_file)

    qas = read_qa_json(qa_file, split='val')

    modified_texts_counter = 0

    print("Modified questions:")
    for k, question in enumerate(qas):
        question_text = util.normalize_text(question.question)
        modified_question, was_modified = synonyms.replace(
            " ".join(question_text))
        # if modified_question != question:
        if was_modified:
            modified_texts_counter += 1
            print("%s: %s --> %s" %
                  (question.qid, " ".join(question_text), modified_question))
            qas[k] = qas[k]._replace(question=modified_question)

        # convert namedtuple to dict to dump as json
        qas[k] = qas[k]._asdict()

    print("Modified %d questions, %.2f%% of the dataset" %
          (modified_texts_counter,
           float(modified_texts_counter) / len(qas) * 100))

    json_list = json.dumps(qas, indent=4)
    with open(outfile, "w") as of:
        of.write(json_list)
Example #4
    def synthesize(self, text, timeout=10000):
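        # Encode the normalized text, generate mel frames with text2mel until
        # attention reaches the EOS character (or the timeout), then upsample
        # with SSRN and convert the spectrogram to a waveform.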
        with torch.no_grad():  # no grad to save memory
            normalized_text = normalize_text(
                text) + "E"  # text normalization, E: EOS
            L = torch.from_numpy(
                np.array([[hp.char2idx[char] for char in normalized_text]],
                         np.long)).to(device)
            zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1),
                                              np.float32)).to(device)
            Y = zeros

            for i in range(timeout):
                _, Y_t, A = self.text2mel(L, Y, monotonic_attention=True)
                Y = torch.cat((zeros, Y_t), -1)
                _, attention = torch.max(A[0, :, -1], 0)
                attention = attention.item()
                if L[0, attention] == hp.vocab.index('E'):  # EOS
                    break

            _, Z = self.ssrn(Y)  # batch ssrn instead?
            Z = Z.cpu().detach().numpy()

        wav = spectrogram2wav(Z[0, :, :].T)
        wav = normalize_audio(wav)
        return wav
Example #5
def test_normalize_text():
    assert util.normalize_text('  this is a test') == 'this is a test'
    assert util.normalize_text('this is a test  ') == 'this is a test'
    assert util.normalize_text('this\r\nis a test') == 'this\\nis a test'
    assert util.normalize_text('this\ris a test') == 'this\\nis a test'
    assert util.normalize_text('this\\nis a test') == 'this\\nis a test'
    assert util.normalize_text(['this is a test']) == ['this is a test']
    assert util.normalize_text({'this': 'is a test'}) == {'this': 'is a test'}
Example #7
    def transform_to_dataset(self, x_set, y_set):
        X, y = [], []
        for document, topic in zip(list(x_set), list(y_set)):
            document = normalize_text(document)
            X.append(document.strip())
            y.append(topic)
            # Augmentation by removing Vietnamese diacritics
            X.append(no_marks(document))
            y.append(topic)
        return X, y
Example #8
    def extract_deaths(self, html):
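        # Collect the table rows following the "Character Deaths" header,
        # stopping at "Account Information", and return normalized
        # timestamp/description pairs.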
        text = html.find("b", string="Character Deaths")
        if text:
            result = []
            rows = text.find_all_next("tr")

            for item in rows:
                if item.text == "Account Information":
                    break

                timestamp = normalize_text(
                    item.select_one("td:nth-of-type(1)").text.strip()
                )
                description = normalize_text(
                    item.select_one("td:nth-of-type(2)").text.strip()
                )

                result.append({"timestamp": timestamp, "description": description})

            return result
Example #9
    def _parse_from_line(self, splitted, column_map):
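        # Map the columns of the pre-split line onto attributes using the
        # indices in column_map.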
        cm = column_map

        self.id = splitted[cm['header']]
        self.name = add_nbsp(splitted[cm['name']])
        self.state = splitted[cm['state']]
        self.garant = add_nbsp(splitted[cm['garant']])
        self.head = splitted[cm['head']]
        self.contact = splitted[cm['contact']]
        self.fields = splitted[cm['fields']].split(',')
        self.fields = list(
            map(lambda s: normalize_text(s.strip()), self.fields))
        self.annotation = add_nbsp(splitted[cm['annotation']])
Example #10
def process_apportionment(save=False, sql=False, companies=False):
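    # Load apportionment data from SQL or a local CSV, clean dates, text and
    # numbers, drop out-of-scope Enbridge, Cochin and small pipeline records,
    # then build per-company output via process_company.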

    if sql:
        df = get_data(os.getcwd(), "apportionment.sql", "PipelineInformation",
                      sql)
    else:
        print('reading local apportionment csv...')
        df = pd.read_csv("./raw_data/apportionment.csv")

    df = normalize_dates(df, ['Date'])
    df = normalize_text(df, ['Pipeline Name'])
    # enbridge processing
    df = df.drop(df[(df['Pipeline Name'] == 'EnbridgeMainline')
                    & (df['KeyPointID'].isin(['KP0016', 'KP0021']))].index)
    df = df.drop(df[(df['Pipeline Name'] == 'EnbridgeMainline')
                    & (df['Date'].dt.year < 2016)].index)
    # cochin processing
    df = df.drop(df[(df['Pipeline Name'] == 'Cochin')
                    & (df['KeyPointID'] != 'KP0018')].index)
    df = df[~df['Pipeline Name'].
            isin(["SouthernLights", "Westpur", "TransNorthern"])].reset_index(
                drop=True)

    df = df.rename(columns={x: x.split("(")[0].strip() for x in df.columns})
    num_cols = [
        'Available Capacity', 'Original Nominations', 'Accepted Nominations',
        'Apportionment Percentage'
    ]
    df = normalize_numeric(df, num_cols, 2)
    df = conversion(df, "oil", num_cols[:-1], 2, False)
    df['Apportionment Percentage'] = df['Apportionment Percentage'].round(2)
    company_files = get_company_list("all")

    if companies:
        company_files = companies

    enbridge_points = get_enbridge_points(sql)
    df = sort_by_points(df)

    for company in company_files:
        try:
            this_company_data = process_company(df, company, enbridge_points,
                                                save)
            print("completed: " + company)
        except:
            print("apportionment error: " + company)
            raise

    return this_company_data
Example #11
    def _parse_from_line(self, splitted, column_map):
        cm = column_map

        self.id = splitted[cm['header']]
        self.short_name = splitted[cm['short-name']]
        self.full_name = add_nbsp(splitted[cm['full-name']])
        self.orgs = parse_orgs(splitted[cm['orgs']])
        self.fields = splitted[cm['fields']].split(',')
        self.fields = list(
            map(lambda s: normalize_text(s.strip()), self.fields))
        self.type = splitted[cm['type']]
        self.date = splitted[cm['date']]
        self.target = add_nbsp(splitted[cm['target']])
        self.link = splitted[cm['link']]
        self.price = add_nbsp(splitted[cm['price']])
        self.place = add_nbsp(splitted[cm['place']])
        self.contact = splitted[cm['contact']]
        self.highlighted = (splitted[cm['highlighted']].lower() == 'ano')
        self.annotation = add_nbsp(splitted[cm['annotation']])
Example #12
File: dataset.py  Project: mircean/ML
def text2words(text):
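    # Tokenize with spaCy, drop whitespace, bracket, punctuation and ignored
    # tokens (optionally stop words), optionally lemmatize, then lowercase and
    # normalize each remaining token.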
    # This works better for IMDB; not sure why, but quotes seem to be handled inconsistently by spaCy
    text = text.replace('"', ' ')
  
    doc = nlp(text)
    
    words = [w for w in doc if not w.is_space and not w.is_bracket and not w.is_punct]
    if opt['text_pp_remove_stop_words']:
        words = [w for w in words if not w.is_stop ]
        
    words = [w for w in words if not ignore_word(w)]
    
    if opt['text_pp_lemmatization']:
        words = [w.lemma_ if w.lemma_ != '-PRON-' else w.lower_ for w in words]
    else:
        words = [w.lower_ for w in words]
        
    words = [util.normalize_text(w) for w in words]
    return words
Example #13
def get_traffic_data(sql=False,
                     query='throughput_gas_monthly.sql',
                     db="PipelineInformation"):
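    # Read the query result from SQL (caching a CSV copy) or from a local CSV,
    # with extra text and coordinate cleanup for key point data.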

    csv_name = query.split(".")[0] + '.csv'
    if sql:
        print('reading sql ' + query.split(".")[0])
        df = execute_sql(path=os.path.join(os.getcwd(), "queries"),
                         query_name=query,
                         db=db)
        df.to_csv('raw_data/' + csv_name, index=False)

    else:
        print('reading local ' + query.split(".")[0])
        df = pd.read_csv('raw_data/' + csv_name, encoding='utf-8')

    # initial processing for key points
    if query == 'key_points.sql':
        df = normalize_text(df, ['Key Point', 'Pipeline Name'])
        df = normalize_numeric(df, ['Latitude', 'Longitude'], 3)

    return df
Example #14
def process_apportionment(test=False, sql=False, companies=False):

    if sql:
        df = get_data(False, True, "apportionment.sql")
    elif test:
        print('no tests for apportionment data!')
    else:
        print('reading local apportionment csv...')
        df = pd.read_csv("./raw_data/apportionment.csv")

    df = normalize_dates(df, ['Date'])
    df = normalize_text(df, ['Corporate Entity'])
    # enbridge processing
    df = df.drop(df[(df['Corporate Entity'] == 'Enbridge Pipelines Inc.') & (df['Key Point'] != 'system')].index)
    df = df.drop(df[(df['Corporate Entity'] == 'Enbridge Pipelines Inc.') & (df['Date'].dt.year < 2016)].index)
    # cochin processing
    df = df.drop(df[(df['Corporate Entity'] == 'PKM Cochin ULC') & (df['Key Point'] != 'Ft. Saskatchewan')].index)
    df = df[~df['Pipeline Name'].isin(["Southern Lights Pipeline",
                                       "Westpur Pipeline",
                                       "Trans-Northern"])].reset_index(drop=True)

    df['Key Point'] = df['Key Point'].replace("All", "system")
    df = addIds(df)
    del df['Pipeline Name']
    df = df.rename(columns={x: x.split("(")[0].strip() for x in df.columns})
    numCols = ['Available Capacity', 'Original Nominations', 'Accepted Nominations', 'Apportionment Percentage']
    df = normalize_numeric(df, numCols, 2)
    df = conversion(df, "oil", numCols[:-1], 2, False)

    df['Apportionment Percentage'] = df['Apportionment Percentage'].round(2)

    company_files = ['NOVA Gas Transmission Ltd.',
                     'Westcoast Energy Inc.',
                     'TransCanada PipeLines Limited',
                     'Alliance Pipeline Ltd.',
                     'Trans Quebec and Maritimes Pipeline Inc.',
                     'Maritimes & Northeast Pipeline Management Ltd.',
                     'Many Islands Pipe Lines (Canada) Limited',
                     'Emera Brunswick Pipeline Company Ltd.',
                     'Foothills Pipe Lines Ltd.',
                     'Enbridge Pipelines Inc.',
                     'TransCanada Keystone Pipeline GP Ltd.',
                     'Trans Mountain Pipeline ULC',
                     'PKM Cochin ULC',
                     'Trans-Northern Pipelines Inc.',
                     'Enbridge Pipelines (NW) Inc.',
                     'Enbridge Southern Lights GP Inc.',
                     'Kingston Midstream Westspur Limited',
                     'Vector Pipeline Limited Partnership',
                     'Many Islands Pipe Lines (Canada) Limited',
                     'Plains Midstream Canada ULC',
                     'Enbridge Bakken Pipeline Company Inc.',
                     'Express Pipeline Ltd.',
                     'Genesis Pipeline Canada Ltd.',
                     'Montreal Pipe Line Limited',
                     'Aurora Pipeline Company Ltd',
                     'Kingston Midstream Westspur Limited',
                     'Enbridge Southern Lights GP Inc.',
                     'Emera Brunswick Pipeline Company Ltd.']

    # for company in ['Enbridge Pipelines (NW) Inc.']:
    for company in company_files:
        thisCompanyData = {}
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Corporate Entity'] == company].copy().reset_index(drop=True)
        if not df_c.empty:
            thisCompanyData['build'] = True
            df_c = df_c.drop_duplicates(subset=['Date'])
            df_c = df_c.sort_values(by='Date')
            minDate = min(df_c['Date']) - dateutil.relativedelta.relativedelta(months=1)
            thisCompanyData["keyPoint"] = list(df_c['Key Point'])[0]
            thisCompanyData["company"] = company
            hasCap = hasData(df_c, "Available Capacity")
            hasOrigNom = hasData(df_c, "Original Nominations")
            hasAccepNom = hasData(df_c, "Accepted Nominations")
            hasPct = hasNotNull(df_c, "Apportionment Percentage")
            lineData, areaData, pctData = [], [], []
            series = []
            series.append({"name": "date",
                           "min": [minDate.year, minDate.month-1, minDate.day]})
            for cap, oNom, aNom, aPct in zip(df_c['Available Capacity'],
                                             df_c['Original Nominations'],
                                             df_c['Accepted Nominations'],
                                             df_c['Apportionment Percentage']):

                if hasCap and hasOrigNom:
                    linePoint = cap
                    areaPoint = oNom
                    areaName = "on"  # Original Nominations
                    lineName = "ac"  # Available Capacity
                elif hasOrigNom and hasAccepNom:
                    linePoint = aNom
                    areaPoint = oNom
                    areaName = "on"
                    lineName = "an"
                else:
                    raise ApportionSeriesCombinationError(company)

                pctData.append(aPct)
                lineData.append(linePoint)
                areaData.append(areaPoint)

            series.append({"id": lineName,
                           "data": lineData,
                           "yAxis": 0,
                           "type": "line"})
            series.append({"id": areaName,
                           "data": areaData,
                           "yAxis": 0,
                           "type": "area"})
            if hasPct:
                series.append({"id": "ap",  # Apportionment Percent
                               "data": pctData,
                               "yAxis": 1,
                               "type": "line",
                               "visible": False,
                               # "showInLegend": False
                               })

            thisCompanyData["series"] = series

        else:
            thisCompanyData["build"] = False

        if not test:
            with open('../apportionment/company_data/'+folder_name+'.json', 'w') as fp:
                json.dump(thisCompanyData, fp, default=str)

    return df
Example #15
    def parse_text(self,
                   text,
                   default_speaker,
                   separate_comma=False,
                   n_gram=2,
                   separate_sentence=False,
                   parse_speaker=True,
                   normalize=True):
        """
        Parse the input text into suitable data structure
        :param n_gram: concat sentences of this max length in a line
        :param text: source
        :param default_speaker: the default speaker if no speaker in specified
        :param separate_comma: split by comma
        :param separate_sentence: split sentence if multiple clauses exist
        :param parse_speaker: bool for turn on/off parse speaker
        :param normalize: to convert common punctuation besides comma to comma
        """

        lines = re.split(r'\r\n|\n\r|\r|\n', text)

        line_speaker_dict = {}
        # TODO: allow speakers not in model_list and later are forced to be replaced
        if parse_speaker:
            # re.match(r'^.*(?=:)', text)
            for i, line in enumerate(lines):
                if re.search(r':|\|', line):
                    # ?: non capture group of : and |
                    speaker, line = re.split(r'\s*(?::|\|)\s*', line, 1)
                    # add entry only if the voice model exist in the folder,
                    # the unrecognized one will be changed to default in later code
                    if speaker in self.model_list:
                        line_speaker_dict[i] = speaker
                    lines[i] = line

        if normalize:
            lines = [normalize_text(line) for line in lines]

        # separate by spacy sentencizer
        lines = [
            separate(line, n_gram, comma=separate_comma) for line in lines
        ]

        sentence_dicts = []
        for i, line in enumerate(lines):
            for j, sent in enumerate(line):
                if sentence_dicts:
                    if sent[0].is_punct and not any(sent[0].text == punct
                                                    for punct in ['“', '‘']):
                        sentence_dicts[-1][
                            'punct'] = sentence_dicts[-1]['punct'] + sent.text
                        continue
                sentence_dict = {
                    'text':
                    sent.text,
                    'begin':
                    True if j == 0 else False,
                    'punct':
                    '',
                    'speaker':
                    line_speaker_dict.get(i, self.model_list[default_speaker])
                }

                while not sentence_dict['text'][-1].isalpha():
                    sentence_dict['punct'] = sentence_dict[
                        'punct'] + sentence_dict['text'][-1]
                    sentence_dict['text'] = sentence_dict['text'][:-1]
                # Reverse the punctuation order since I add it based on the last item
                sentence_dict['punct'] = sentence_dict['punct'][::-1]
                sentence_dict[
                    'text'] = sentence_dict['text'] + sentence_dict['punct']
                sentence_dicts.append(sentence_dict)

        speaker_dict = {}
        for i, sentence_dict in enumerate(sentence_dicts):
            if sentence_dict['speaker'] not in speaker_dict:
                speaker_dict[sentence_dict['speaker']] = []
            speaker_dict[sentence_dict['speaker']].append(i)
        self.speaker_dict = speaker_dict
        self.sentence_dicts = sentence_dicts
Example #16
def process_conditions(remote=False,
                       sql=False,
                       non_standard=True,
                       company_names=False,
                       companies=False,
                       test=False,
                       save=True):
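    # Load the conditions data (remote, test, or local CSV), clean company and
    # status fields, drop detail columns, build region/project id maps, and
    # generate per-company condition output via process_company.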
    if remote:
        print('downloading remote conditions file')
        link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
        df = pd.read_csv(
            link,
            # sep='\t',
            # lineterminator='\r',
            encoding="latin-1",
            error_bad_lines=True)
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        df = pd.read_csv('./raw_data/conditions_en.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])

    if not non_standard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']

    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]

    for delete in delete_cols:
        del df[delete]

    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())
    df = apply_system_id(df, "Company")

    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)

    df = add_links(df, sql)
    if company_names:
        print(get_company_names(df['Company']))

    df, region_replace, project_names = idify_conditions(df, sql)
    regions_map = import_simplified(region_replace)

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            df_c, shp, dfmeta, meta = process_company(df, company,
                                                      project_names,
                                                      regions_map, test, save)
            print("completed: " + company)
        except:
            print("conditions error: " + company)
            raise

    return df_c, shp, dfmeta, meta
Example #17
def process_tolls_data(sql=True, companies=False, save=True, completed=[]):
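    # Clean the tolls data, then build per-company chart metadata (paths,
    # products, services, units, toll numbers) and path series, handling
    # split systems such as Enbridge Mainline, and save one JSON per company.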

    def generate_path_series(df, paths, series_col):
        path_series = []
        for path in paths:
            df_p = df[df["Path"] == path].copy().reset_index(drop=True)
            if not df_p.empty:
                path_series.append({"pathName": path,
                                   "series": process_path(df_p, series_col)})
        return path_series

    def find_series_col(df, company):
        products = sorted(list(set(df["Product"])))
        services = sorted(list(set(df["Service"])))
        units = list(set(df["Original Toll Unit"]))
        if len(products) > 1:
            product_filter = list(set(df["Product"]))
            product_filter = [[x, True] if x == "heavy crude" else [x, False] for x in product_filter]
        else:
            product_filter = False
        
        if len(units) > 1:
            series_col = "Units"
            print("Multiple units for: "+company)
        elif len(products) > 1 and len(services) <= 1:
            series_col = "Product"
        elif len(services) > 1 and len(products) <= 1:
            series_col = "Service"
        elif len(services) <= 1 and len(products) <= 1:
            series_col = "Path"
        elif len(products) > 1 and len(services) > 1:
            series_col = "Service"
        else:
            series_col = "Service"
            print("error! Need to filter on two columns")
        # override series col if needed
        if company in ["Westcoast", "Keystone"]:
            series_col = "Path"

        return series_col, product_filter

    df, descriptions, toll_nums = get_tolls_data(sql)
    toll_nums = normalize_dates(toll_nums, ["s", "e"])
    df = normalize_text(df, ['Product', 'Path', 'Service', 'Original Toll Unit', 'Converted Toll Unit'])
    df = normalize_dates(df, ["Effective Start", "Effective End"])
    df = df[~df["Effective Start"].isnull()].copy().reset_index(drop=True)

    company_files = get_company_list()
    process_description(descriptions, save)

    if companies:
        company_files = companies
    for company in company_files:
        # print(company)
        this_company_data = {}
        if company == "EnbridgeMainline":
            df_c = df[df["PipelineID"].isin(["EnbridgeMainline", "EnbridgeFSP", "EnbridgeLocal"])].copy().reset_index(drop=True)
        else:
            df_c = df[df["PipelineID"] == company].copy().reset_index(drop=True)

        df_c, selected_paths, selectedService, path_filter, split_default, path_totals, decimals = company_filter(df_c, company)
        meta = {"companyName": company}
        if not df_c.empty and company in completed:
            meta["build"] = True
            meta["pathTotals"] = path_totals
            meta["decimals"] = decimals
            paths = sorted(list(set(df_c["Path"])))
            services = sorted(list(set(df_c["Service"])))
            units = list(set(df_c["Original Toll Unit"]))
            meta["pathFilter"] = path_filter
            meta["split"] = {"default": split_default}
            if split_default:
                meta["split"]["buttons"] = list(set(df_c["split"]))
                path_series = {}
                meta["paths"], meta["seriesCol"], meta["products"], meta["services"], meta["units"], meta["tollNum"], meta["unitsFilter"] = {}, {}, {}, {}, {}, {}, {}
                if company == "EnbridgeMainline":
                    meta["splitDescription"] = {}
                else:
                    meta["splitDescription"] = False

                for split in list(set(df_c["split"])):
                    df_split = df_c[df_c["split"] == split].copy().reset_index(drop=True)
                    # add toll numbers
                    this_nums = toll_nums[toll_nums["PipelineID"] == list(df_split["PipelineID"])[0]].copy()
                    del this_nums["PipelineID"]
                    meta["tollNum"][split] = this_nums.to_dict(orient="records")
                    # add enbridge descriptions
                    if meta["splitDescription"] != False and split != "Enbridge Mainline":
                        current_definition = descriptions[descriptions["PipelineID"] == list(df_split["PipelineID"])[0]]
                        meta["splitDescription"][split] = list(current_definition["Toll Description"])[0]


                    paths = sorted(list(set(df_split["Path"])))
                    services = sorted(list(set(df_split["Service"])))
                    units = list(set(df_split["Original Toll Unit"]))
                    series_col, product_filter = find_series_col(df_split, company)
                    if len(selected_paths) > 0:
                        meta["paths"][split] = [[p, True] if p in selected_paths[split] else [p, False] for p in paths]
                    else:
                        meta["paths"][split] = [[p, True] for p in paths]
                    meta["products"][split] = product_filter
                    meta["seriesCol"][split] = series_col
                    meta["unitsFilter"][split] = units_filter(df_split)
                    if selectedService:
                        meta["services"][split] = [[s, True] if s == selectedService[split] else [s, False] for s in services]
                    else:
                        meta["services"][split] = selectedService
                    meta["units"][split] = units
                    path_series[split] = generate_path_series(df_split, paths, series_col)
            else:
                # add toll numbers
                this_nums = toll_nums[toll_nums["PipelineID"] == company].copy()
                del this_nums["PipelineID"]
                meta["tollNum"] = this_nums.to_dict(orient="records")
                series_col, product_filter = find_series_col(df_c, company)
                meta["products"] = product_filter
                meta["seriesCol"] = series_col
                meta["paths"] = [[p, True] if p in selected_paths else [p, False] for p in paths]
                meta["services"] = [[s, True] if s == selectedService else [s, False] for s in services]
                meta["units"] = units
                meta["unitsFilter"] = units_filter(df_c)
                path_series = generate_path_series(df_c, paths, series_col)

            this_company_data["meta"] = meta
            this_company_data["tolls"] = path_series
        else:
            meta["build"] = False
            this_company_data["meta"] = meta

        if save:
            with open('../data_output/tolls/'+company+'.json', 'w') as fp:
                json.dump(this_company_data, fp, default=str)

    return df_c, this_company_data
Example #18
def process_remediation(sql=False,
                        remote=True,
                        companies=False,
                        test=False,
                        save=True):
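    # Load the remediation (contamination) data, apply contaminant and
    # categorical ids, shorten column names, and write per-company JSON output.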

    if test:
        print("reading test remediation data")
        df = pd.read_csv(
            os.path.join(os.getcwd(), "raw_data", "test_data",
                         "remediation.csv"))
    elif remote:
        print("reading remote remediation file")
        df = pd.read_csv(
            "https://www.cer-rec.gc.ca/open/compliance/contamination.csv",
            encoding="latin-1",
            engine="python",
        )
        df.to_csv("./raw_data/remediation.csv")
    else:
        print("reading local remediation file")
        df = pd.read_csv("./raw_data/remediation.csv")

    contaminants = get_data(sql=sql,
                            script_loc=os.getcwd(),
                            query="remediationContaminants.sql",
                            db="dsql22cap")
    old = get_data(sql=sql,
                   script_loc=os.getcwd(),
                   query="remediation_pre_2018.sql",
                   db="dsql22cap")

    df = apply_contaminant_ids(df, contaminants)
    df["Contaminants at the Site"] = [["18"] if x is None else x
                                      for x in df["Contaminants at the Site"]]
    df["Site Within 30 Meters Of Waterbody"] = [
        True if x == "Yes" else False
        for x in df["Site Within 30 Meters Of Waterbody"]
    ]
    df = normalize_text(df, [
        'Applicable Land Use', 'Site Status', 'Activity At Time Of Discovery',
        'Pipeline Name', 'Facility Name'
    ])

    pipe_section = []
    na = "Not Specified"
    for pipe, section in zip(df['Pipeline Name'], df['Facility Name']):
        if pipe == na and section == na:
            pipe_section.append("ns")  # Not Specified
        elif pipe == na and section != na:
            pipe_section.append("f")  # Facility
        elif pipe != na and section == na:
            pipe_section.append("p")  # Pipeline
        elif pipe != na and section != na:
            pipe_section.append("pf")  # Pipeline and Facility
        else:
            print("error here!")

    df["ps"] = pipe_section
    del df['Pipeline Name']
    del df['Facility Name']

    # add id's
    land_use_ids = {
        "developed land - industrial": "dli",
        "developed land - small commercial": "dls",
        "developed land - residential": "dlr",
        "barren land": "bl",
        "shrub land": "sl",
        "vegetative barren": "vb",
        "forests": "f",
        "Agricultural Cropland": "ac",
        "water / wetlands": "w",
        "Tundra / Native Prairie / Parks": "t",
        "agricultural land": "al",
        "protected area": "pa",
        "non-developed land": "ndl"
    }

    status_ids = {
        "monitored": "m",
        "post-remediation monitoring": "prm",
        "facility monitoring": "fm",
        "ongoing remediation": "or",
        "site assessment": "sa",
        "risk managed": "rm"
    }

    activity_ids = {
        "maintenance": "m",
        "operation": "o",
        "construction": "c",
        "abandonment": "a"
    }

    df = idify(df, "Applicable Land Use", land_use_ids)
    df = idify(df, "Province", "region")
    df = idify(df, "Site Status", status_ids)
    df = idify(df, "Activity At Time Of Discovery", activity_ids)

    df['Final Submission Date'] = pd.to_datetime(df['Final Submission Date'])
    df['y'] = df['Final Submission Date'].dt.year

    df = df.fillna(value=np.nan)
    for ns in [
            'Applicable Land Use', 'Activity At Time Of Discovery',
            'Contaminants at the Site',
            'Initial Estimate of Contaminated Soil (m3)',
            'Site Within 30 Meters Of Waterbody', 'Site Status', 'Latitude',
            'Longitude'
    ]:

        df[ns] = [
            None if x in ["Not Specified", np.nan, "nan"] else x
            for x in df[ns]
        ]

    for numeric in [
            'Initial Estimate of Contaminated Soil (m3)', 'Latitude',
            'Longitude', 'y'
    ]:

        df[numeric] = df[numeric].replace(np.nan, int(-1))

    for int_numeric in ['y', 'Initial Estimate of Contaminated Soil (m3)']:
        df[int_numeric] = df[int_numeric].astype(int)

    df['loc'] = [[lat, long]
                 for lat, long in zip(df['Latitude'], df['Longitude'])]
    del df['Latitude']
    del df['Longitude']
    columns = {
        "Event ID": "id",
        "Site Status": "s",
        "Activity At Time Of Discovery": "a",
        "Province": "p",
        "Applicable Land Use": "use",
        "Contaminants at the Site": "c",
        "Initial Estimate of Contaminated Soil (m3)": "vol",
        "Site Within 30 Meters Of Waterbody": "w"
    }

    df = df.rename(columns=columns)
    for col in df:
        if col not in columns.values() and col not in [
                "Company Name", "Final Submission Date", "y", "ps", "loc"
        ]:
            del df[col]

    df['Company Name'] = df['Company Name'].replace(company_rename())
    df = apply_system_id(df, "Company Name")

    old["Company"] = old["Company"].replace(company_rename())
    old = apply_system_id(old, "Company")

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company Name'] == company].copy().reset_index(
                drop=True)
            this_company_data = {}

            if not df_c.empty:
                this_company_data["meta"] = meta(df_c, company, old)
                this_company_data["build"] = True
                this_company_data["data"] = optimize_json(df_c)
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                this_company_data['data'] = df_c.to_dict(orient='records')
                this_company_data['meta'] = {"companyName": company}
                this_company_data["build"] = False
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        except:
            print("remediation error: " + company)
            raise

    return df, this_company_data
Example #19
def process_conditions(remote=False,
                       nonStandard=True,
                       company_names=False,
                       companies=False,
                       test=False,
                       lang='en',
                       save=True):
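    # Load the English or French conditions data, clean it, expand each
    # condition's location list into one row per region, then build and save
    # per-company metadata and map regions in the requested language.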
    if remote:
        print('downloading remote conditions file')
        if lang == 'en':
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            linkFR = 'https://www.cer-rec.gc.ca/ouvert/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            fr = pd.read_csv(linkFR,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = process_french(df, fr)

    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        if lang == 'en':
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            fr = pd.read_csv('./raw_data/conditions_fr.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = process_french(df, fr)

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])

    if not nonStandard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']

    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]

    for delete in delete_cols:
        del df[delete]

    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())

    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    df['Theme(s)'] = df['Theme(s)'].replace({"nan": "No theme specified"})

    regions_map = import_simplified()
    df = add_links(df)
    if company_names:
        print(get_company_names(df['Company']))

    if companies:  # used to set one company for testing
        company_files = companies
    else:
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Express Pipeline Ltd.', 'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Many Islands Pipe Lines (Canada) Limited',
            'Maritimes & Northeast Pipeline Management Ltd.',
            'Vector Pipeline Limited Partnership',
            'Plains Midstream Canada ULC',
            'Enbridge Bakken Pipeline Company Inc.',
            'Genesis Pipeline Canada Ltd.', 'Montreal Pipe Line Limited',
            'Kingston Midstream Westspur Limited',
            'Aurora Pipeline Company Ltd'
        ]

    for company in company_files:
        thisCompanyData = {}
        folder_name = company.replace(' ', '').replace('.', '')

        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        if not df_c.empty:
            # df_c = add_links(df_c, links)
            df_c['condition id'] = [
                str(ins) + '_' + str(cond) for ins, cond in zip(
                    df_c['Instrument Number'], df_c['Condition Number'])
            ]
            expanded_locations = []
            for unique in df_c['condition id']:
                row = df_c[df_c['condition id'] == unique].copy().reset_index(
                    drop=True)
                locations = [x.split(',') for x in row['Location']]
                for region in locations[0]:
                    regionProvince = region.strip().split('/')
                    row['id'] = regionProvince[0].strip()
                    row['Flat Province'] = regionProvince[-1].strip()
                    expanded_locations.append(row.copy())

            df_all = pd.concat(expanded_locations,
                               axis=0,
                               sort=False,
                               ignore_index=True)
            # calculate metadata here
            dfmeta, meta = conditionMetaData(df_all, folder_name)
            meta["build"] = True
            thisCompanyData['meta'] = meta
            shp, mapMeta = conditions_on_map(dfmeta, regions_map, folder_name,
                                             lang)

            thisCompanyData['regions'] = shp.to_json()
            thisCompanyData['mapMeta'] = mapMeta.to_dict(orient='records')
            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)
        else:
            meta = {"companyName": company}
            thisCompanyData = {
                'meta': {
                    "companyName": company,
                    "build": False
                },
                'regions': "{}",
                'mapMeta': []
            }

            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)

        # if not test:
        #     print('completed '+lang+' conditions: '+company)

    return df_c, shp, dfmeta, meta
Example #20
    def extract_last_login(self, html):
        # garbled in the source; assuming the login value is in the cell following the label
        result = normalize_text(html.find("td", string="Last Login:").find_next("td").text)
Example #21
    if cell.ctype == 4:
        # bool
        return bool(cell.value)
    assert False


bookings = []
headers = sheet.row(0)
for row_number in range(sheet.nrows)[1:]:
    row = sheet.row(row_number)
    data = dict(
        (headers[index].value, convert_cell_value(value))
        for index, value in enumerate(row)
    )

    if data['hour_start'].date() < today and data['hour_end'].date() < today:
        continue

    assert data['room_id'] in rooms.keys()

    if not data['anspmail']:
        data['anspmail'] = util.normalize_text(data['title']) + '@example.org'
        print(data['anspmail'])
    assert data['anspmail']

    bookings.append(data)

# todo:
# for booking in bookings:
#     util.reserve_resource(...)
Example #22
File: main.py  Project: kenta1984/sif
    mt = MeCab.Tagger(
        '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/ -Owakati')  # adjust the path as appropriate
    wv = KeyedVectors.load_word2vec_format('./vecs/wiki.vec.pt', binary=True)

    # Load the sif data
    w_by_kw = defaultdict(float)
    with open('./vecs/sif.tsv', 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            w_by_kw[row[0]] = float(row[1])

    # Make sentence vectors
    sents1 = ['レストランでカレーを食べた。', '安倍首相が記者会見をした。']
    sents2 = ['定食屋でハンバーグを注文した。', '安倍さんがインタビューに答えた。']
    sents1_vec = [
        get_vector_from_text(normalize_text(sent), mt, wv, w_by_kw)
        for sent in sents1
    ]
    sents2_vec = [
        get_vector_from_text(normalize_text(sent), mt, wv, w_by_kw)
        for sent in sents2
    ]

    # Show the results
    sim_matrix = cosine_similarity(sents1_vec, sents2_vec)
    for i, sims in enumerate(sim_matrix):
        for j, sim in enumerate(sims):
            print(sents1[i])
            print(sents2[j])
            print(sim)
            print()