Example #1
def domain_action(namespace):
    headers, position, reader = custom_reader(namespace.file, namespace.column)

    headers.append(namespace.column + "_domain")
    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(get_domain_name(url))
        writer.writerow(line)
Example #2
def clean_data(CLEAN_DATA_DIRECTORY, SCIENTIFIC_TOPIC, DATE):
    """Import and prepare the dataframe to be used to build the graphs"""

    posts_path = os.path.join(".", CLEAN_DATA_DIRECTORY, 
                              "fake_posts_" + SCIENTIFIC_TOPIC + "_" + DATE + ".csv")
    posts_df = pd.read_csv(posts_path)

    if DATE == "28_04_2020":        
        # Remove the URLs with parameters from the analysis because CT returns wrong results for them:
        posts_df['parameter_in_url'] = posts_df['url'].apply(lambda x: '?' in x)
        posts_df = posts_df[posts_df['parameter_in_url']==False]

    posts_df = posts_df[posts_df["platform"] == "Facebook"]
    posts_df = posts_df.dropna(subset=['account_id', 'url'])
    posts_df['account_id'] = posts_df['account_id'].apply(lambda x:int(x))
    
    # Sometimes the same Facebook group shares the same URL multiple times,
    # creating multiple lines in the input CSV. We remove those duplicates here:
    posts_df = posts_df[['url', 'account_name', 'account_id',
                         'account_subscriber_count', 'actual_like_count']]
    posts_df = posts_df.drop_duplicates(subset=['url', 'account_id'], keep='last')

    posts_df['domain_name'] = posts_df['url'].apply(lambda x: ural.get_domain_name(x))

    if DATE == "28_04_2020":
        # Remove the platforms from the analysis:
        platforms = ["facebook.com", "youtube.com", "twitter.com", "wordpress.com", "instagram.com"]
        posts_df = posts_df[~posts_df['domain_name'].isin(platforms)]

    # We remove the facebook groups that have shared only one fake URL:
    vc = posts_df['account_id'].value_counts()
    posts_df = posts_df[posts_df['account_id'].isin(vc[vc > 1].index)]

    # We prepare a dataframe to import the facebook group nodes with specific attributes:
    # - the number of followers
    # - the account name -> label
    # - the number of fake news URLs shared by this group -> node size
    fb_group_df = posts_df[['account_id', 'account_name', 'account_subscriber_count']]\
                             .sort_values(by="account_subscriber_count", ascending=True)\
                             .drop_duplicates(subset = ['account_id'], keep='last')

    temp = posts_df.groupby('account_id')['url'].apply(list)\
                .to_frame().reset_index()
    fb_group_df = fb_group_df.merge(temp, left_on='account_id', right_on='account_id', how='left')
    fb_group_df['nb_fake_news_shared'] = fb_group_df['url'].apply(lambda x:len(x))

    # We prepare a dataframe to import the domain name nodes with a specific attribute:
    # - the number of fake news URLs shared from this domain -> node size
    domain_df = posts_df[['url', 'domain_name']].drop_duplicates()\
                    .groupby('domain_name')['url'].apply(list)\
                    .to_frame().reset_index()
    domain_df['nb_fake_news_shared'] = domain_df['url'].apply(lambda x:len(x))
    
    return posts_df, fb_group_df, domain_df
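
A hypothetical call, only to make the expected inputs and outputs concrete. The directory and topic values below are invented; the function simply expects a CSV named "fake_posts_<topic>_<date>.csv" inside the given directory.

# Illustrative arguments, not taken from the project's configuration.
posts_df, fb_group_df, domain_df = clean_data(
    CLEAN_DATA_DIRECTORY="clean_data",
    SCIENTIFIC_TOPIC="health",
    DATE="28_04_2020",
)
print(len(posts_df), len(fb_group_df), len(domain_df))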
Example #3
def extract_standard_addendum(cli_args, url):
    inferred_redirection = infer_redirection(url)

    return [
        normalize_url(url,
                      strip_protocol=cli_args.strip_protocol,
                      strip_trailing_slash=True),
        inferred_redirection if inferred_redirection != url else '',
        get_domain_name(url),
        get_hostname(url),
        get_normalized_hostname(url), 'yes' if is_shortened_url(url) else ''
    ]
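
A minimal, hypothetical call using argparse.Namespace as a stand-in for the parsed CLI arguments; strip_protocol is the only option the helper reads, and the URL is invented.

from argparse import Namespace

# Invented arguments, only to show the shape of the returned row addendum:
# [normalized url, inferred redirection or '', domain name, hostname,
#  normalized hostname, 'yes' if the URL looks shortened].
addendum = extract_standard_addendum(Namespace(strip_protocol=False), 'http://bit.ly/example')
print(addendum)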
Example #4
def clean_data(url_df, fact_check_df, SCIENTIFIC_TOPIC):
    """Clean and merge the appearance data"""

    # Remove the spaces erroneously added around the URLs
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    # Filter the URLs to keep only the ones flagged as False or equivalent:
    url_df = url_df[(url_df['Flag as'].isin(
        ['False', 'Partly false', 'Misleading', 'False headline']))]

    # Use a regex to get the article's field from the fact-check website URL:
    # if the fact-check URL starts with 'https://climatefeedback.org' -> 'climate' field
    # if the fact-check URL starts with 'https://healthfeedback.org'  -> 'health' field
    fact_check_df['field'] = fact_check_df['Review url'].str.extract(
        'https://([^/]+)feedback.org')

    # Merge the two dataframes to get the 'field' for each url:
    url_df = url_df.dropna(subset=['Item reviewed'])
    fact_check_df = fact_check_df.dropna(subset=['Items reviewed'])
    url_df = url_df.merge(fact_check_df[['Items reviewed', 'field', 'topic']],
                          left_on='Item reviewed',
                          right_on='Items reviewed',
                          how='left')

    # Keep only the URLs about the scientific topic of interest:
    url_df.loc[url_df['topic'] == 'COVID-19', 'field'] = 'COVID-19'
    url_df = url_df.dropna(subset=['field'])
    url_df = url_df[url_df['field'] == SCIENTIFIC_TOPIC]

    # Clean the URLs and extract their domain names:
    url_df['url'] = url_df['url'].apply(lambda x: ural.normalize_url(
        x, strip_protocol=False, strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove the duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    # # Remove the platforms from the analysis:
    # platforms = ["facebook.com", "youtube.com", "twitter.com", "wordpress.com", "instagram.com"]
    # url_df = url_df[~url_df['domain_name'].isin(platforms)]

    # # Remove the URLs with parameters from the analysis because CT returns wrong results for them:
    # url_df['parameter_in_url'] = url_df['url'].apply(lambda x: '?' in x)
    # url_df = url_df[url_df['parameter_in_url']==False]

    url_df = url_df[['url', 'Item reviewed', 'field', 'domain_name']]

    return url_df
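
The regex above maps each fact-check review URL to its field by capturing whatever sits between "https://" and "feedback.org". A small illustration with invented review URLs:

import pandas as pd

# Invented review URLs; the capture group yields 'climate' and 'health'.
reviews = pd.Series([
    'https://climatefeedback.org/claimreview/some-claim/',
    'https://healthfeedback.org/claimreview/another-claim/',
])
print(reviews.str.extract('https://([^/]+)feedback.org'))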
Example #5
def clean_url_format(url_df):

    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    url_df['url_cleaned'] = url_df['url']\
        .apply(lambda x: ural.normalize_url(x,
                                            strip_protocol=False,
                                            strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url_cleaned'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove the duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    return url_df
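
A hypothetical run, assuming the function above and its pandas and ural imports are in scope; the rows are invented, the duplicated URL should be dropped and a domain name extracted for the rest.

import pandas as pd

# Invented rows: stray whitespace, a trailing slash and one duplicate.
df = pd.DataFrame({'url': [' https://www.example.com/article/ ',
                           'https://www.example.com/article/',
                           'https://example.org/other']})
print(clean_url_format(df)[['url_cleaned', 'domain_name']])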
Example #6
def payloads_iter(iterator, key=None):
    for item in iterator:
        url = item if key is None else key(item)

        if not url:
            yield FetchWorkerPayload(
                item=item,
                domain=None,
                url=None
            )

            continue

        # Url cleanup
        url = ensure_protocol(url.strip())

        yield FetchWorkerPayload(
            item=item,
            domain=get_domain_name(url),
            url=url
        )
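
A hypothetical way to drive the generator with plain URL strings, so no key function is needed; an empty value takes the first branch and gets a None domain.

# Invented inputs, assuming FetchWorkerPayload and the ural helpers used above are in scope.
for payload in payloads_iter(['https://www.lemonde.fr/pixels/article.html', '']):
    print(payload.domain, payload.url)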
Example #7
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
Example #8
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)
Example #9
def grouper(job):
    return get_domain_name(job.url)
Example #10
def test_basics(self):
    for url, domain in TESTS:
        assert get_domain_name(url) == domain
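
TESTS is defined elsewhere in the test module; a hypothetical shape for it, only to make the loop concrete. The pairs are invented and assume ural's usual behaviour of dropping the "www." prefix.

# Invented (url, expected_domain) pairs in the spirit of the fixture used above.
TESTS = [
    ('http://www.lemonde.fr/pixels/article.html', 'lemonde.fr'),
    ('https://lemonde.fr', 'lemonde.fr'),
]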