Example #1
    def get_locations(self, tweet):
        # Clean the raw tweet, POS-tag it with the TnT tagger, then chunk it
        # with the regexp parser to pull out location (LOC) phrases.
        tweet = cleaner.clean(tweet)
        tagged_chunked_tweet = self.cp.parse(
            self.tnt_pos_tagger.tag(nltk.word_tokenize(tweet)))

        locations = []
        for subtree in tagged_chunked_tweet.subtrees():
            if subtree.label() == 'LOC':
                # Join the words of the LOC chunk back into a single phrase.
                location = [leaf[0] for leaf in subtree.leaves()]
                locations.append(' '.join(location))

        return locations
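
# A minimal setup sketch (an assumption, not part of the original excerpt):
# get_locations() relies on self.tnt_pos_tagger and self.cp, which could be
# built roughly like this. The training sentence and the LOC grammar below
# are hypothetical placeholders.
import nltk
from nltk.tag import tnt

tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train([[('Banjir', 'NN'), ('di', 'IN'), ('Jakarta', 'NNP')]])
cp = nltk.RegexpParser('LOC: {<NNP>+}')  # group consecutive proper nouns

tagged = tnt_pos_tagger.tag(['Banjir', 'di', 'Jakarta', 'Barat'])
tree = cp.parse(tagged)  # subtrees labelled 'LOC' hold the location phrases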
args = parser.parse_args()

# difflib.SequenceMatcher; characters equal to a space are treated as junk
sm = SequenceMatcher(lambda x: x == " ")

ngrams = 1

progress = 0

results = []

# Each dataset row is expected to hold (time, tweet, time2, tweet2).
with open(os.path.join(os.path.dirname(__file__), 'aaaaa.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2], line[3]) for line in dataset]

cleaned_tweets = [(time, tweet, cleaner.clean(tweet), time2, tweet2,
                   cleaner.clean(tweet2))
                  for (time, tweet, time2, tweet2) in tweets]
tokenized_tweets = [
    (time, tweet, cleaned, tokenizer.ngrams_tokenizer(cleaned, ngrams), time2,
     tweet2, cleaned2, tokenizer.ngrams_tokenizer(cleaned2, ngrams))
    for (time, tweet, cleaned, time2, tweet2, cleaned2) in cleaned_tweets
]
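
# A rough sketch (an assumption; the project's tokenizer module is not shown
# in this excerpt) of what tokenizer.ngrams_tokenizer could look like for
# whitespace-separated words:
def ngrams_tokenizer_sketch(text, n):
    # Slide a window of n consecutive words over the text.
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# ngrams_tokenizer_sketch('banjir di jakarta barat', 2)
# -> ['banjir di', 'di jakarta', 'jakarta barat']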

# tfidf_input =
# tfidf_obj = tfidf.TFIDF(cleaned_tweets)

for (time, tweet, cleaned, tokens, time2, tweet2, cleaned2,
     tokens2) in tokenized_tweets:
    progress += 1
    print('\r{}/{}'.format(progress, len(tokenized_tweets)), end='')
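    # Hypothetical continuation of the loop body (an assumption; the excerpt
    # is cut off here): compare the two cleaned tweets with the
    # SequenceMatcher set up above and keep the similarity ratio.
    sm.set_seqs(cleaned, cleaned2)
    ratio = sm.ratio()  # difflib similarity in [0.0, 1.0]
    results.append([time, tweet, time2, tweet2, ratio])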
Example #3
                    help='File name for output CSV, e.g. output.csv')
args = parser.parse_args()

threshold = 0.55

progress = 0

results = []

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/tweet-2016-07-06-clean.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned_tweets = [(time, tweet, category, cleaner.clean(tweet))
                  for (time, tweet, category) in tweets]

for (time, tweet, category, cleaned) in cleaned_tweets:
    progress += 1
    print('\r{}/{}'.format(progress, len(cleaned_tweets)), end='')

    result = []

    for (time2, tweet2, category2, cleaned2) in cleaned_tweets:
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
        dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')

        if category2 == 'new' and dt > dt2:
            # time_diff = dt - dt2
            sm.set_seqs(cleaned, cleaned2)
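            # Hypothetical continuation (an assumption; the excerpt is cut off
            # here): record the pair when the difflib ratio reaches the
            # threshold defined above.
            ratio = sm.ratio()
            if ratio >= threshold:
                result.append([time2, tweet2, ratio])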
def calculate(hours):
    results = []
    calculation = calculations[args.calculation]
    for ngrams in range(1, 25):  # 1-24
        for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                          1.0]:  # 0.1-1.0
            start_time = tm.time()

            cleaned = [(time, tweet, category, cleaner.clean(tweet))
                       for (time, tweet, category) in tweets]
            tokenized = [(time, tweet, category,
                          tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                         for (time, tweet, category, cleaned_tweets) in cleaned
                         ]

            distincts = []
            tp, tn, fp, fn = 0, 0, 0, 0

            for (time, tweet, category, tokens) in tokenized:
                if len(distincts) == 0:
                    distincts.append((time, tweet, tokens))
                else:
                    is_distinct = {'text': True, 'tl': True}
                    # Iterate over a copy so that stale entries can be removed
                    # from distincts safely while looping.
                    for (distinct_time, distinct_tweet,
                         distinct_tokens) in list(distincts):
                        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                        distinct_dt = datetime.strptime(
                            distinct_time, '%Y-%m-%d %H:%M:%S')
                        time_diff = dt - distinct_dt

                        if time_diff > timedelta(hours=hours):
                            distincts.remove((distinct_time, distinct_tweet,
                                              distinct_tokens))
                            continue

                        index = calculation.index(tokens, distinct_tokens)
                        if index >= threshold:
                            is_distinct['text'] = False

                        if (t.is_text_similar(tweet, distinct_tweet) and
                                l.is_first_loc_similar(tweet, distinct_tweet)):
                            is_distinct['tl'] = False

                    if is_distinct['text'] or is_distinct['tl']:
                        distincts.append((time, tweet, tokens))

                        if category == 'new':
                            tp += 1
                        else:
                            fp += 1
                    else:
                        if category == 'new':
                            fn += 1
                        else:
                            tn += 1

            time_elapsed = tm.time() - start_time
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            fscore = 2 * (precision * recall) / (precision + recall)

            print()
            print('Limit hours: {}'.format(hours))
            print('Calculation: {}'.format(args.calculation))
            print('Ngrams: {}'.format(ngrams))
            print('Threshold: {}'.format(threshold))
            print('True positive: {}'.format(tp))
            print('True negative: {}'.format(tn))
            print('False positive: {}'.format(fp))
            print('False negative: {}'.format(fn))
            print('Accuracy: {}'.format(accuracy))
            print('Precision: {}'.format(precision))
            print('Recall: {}'.format(recall))
            print('F-score: {}'.format(fscore))
            print('Time elapsed: {}'.format(time_elapsed))

            results.append([
                args.calculation, hours, ngrams, threshold, tp, tn, fp, fn,
                accuracy, precision, recall, fscore, time_elapsed
            ])
    return results
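
# A minimal usage sketch (an assumption; the surrounding script is not part of
# this excerpt): run calculate() for a few time windows and write the result
# rows to a CSV. The file name and header row below are hypothetical.
if __name__ == '__main__':
    all_results = []
    for hours in (1, 6, 12, 24):
        all_results.extend(calculate(hours))

    with open('benchmark_results.csv', 'w', newline='') as csv_output:
        csv_writer = csv.writer(csv_output, quoting=csv.QUOTE_ALL)
        csv_writer.writerow(['calculation', 'hours', 'ngrams', 'threshold',
                             'tp', 'tn', 'fp', 'fn', 'accuracy', 'precision',
                             'recall', 'fscore', 'time_elapsed'])
        csv_writer.writerows(all_results)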
Example #5
    # ('extended_jaccard', ExtendedJaccard()),
    ('dice', Dice()),
    ('manhattan', Manhattan()),
    # ('euclidean', Euclidean()),
    ('overlap', Overlap()),
    # ('pearson', Pearson()),
    # ('combination', Combination())
]

hours = 12

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity_dataset_15028.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned_tweets = [(time, tweet, category, cleaner.clean(tweet)) for (time, tweet, category) in tweets]
tfidf = TFIDF(cleaned_tweets)

def calculate(calculation):
    results = []
    for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: # 0.1-1.0
        start_time = tm.time()

        distincts = []
        tp, tn, fp, fn = 0, 0, 0, 0

        for (time, tweet, category, cleaned) in cleaned_tweets:
            if len(distincts) == 0:
                distincts.append((time, tweet, cleaned))
            else:
                is_distinct = True
def calculate(hours):
    results = []
    for name, calculation in calculations.items():
        for ngrams in range(1, 7):  # 1-6
            for threshold in numpy.arange(0.1, 1.1, 0.1):  # 0.1-1.0
                start_time = tm.time()

                cleaned = [(time, tweet, category, cleaner.clean(tweet))
                           for (time, tweet, category) in tweets]
                tokenized = [
                    (time, tweet, category,
                     tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                    for (time, tweet, category, cleaned_tweets) in cleaned
                ]

                distincts = []
                tp, tn, fp, fn = 0, 0, 0, 0

                for (time, tweet, category, tokens) in tokenized:
                    if len(distincts) == 0:
                        distincts.append((time, tweet, tokens))
                    else:
                        is_distinct = True
                        # Iterate over a copy so that stale entries can be
                        # removed from distincts safely while looping.
                        for (distinct_time, distinct_tweet,
                             distinct_tokens) in list(distincts):
                            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                            distinct_dt = datetime.strptime(
                                distinct_time, '%Y-%m-%d %H:%M:%S')
                            time_diff = dt - distinct_dt

                            if time_diff > timedelta(hours=hours):
                                distincts.remove(
                                    (distinct_time, distinct_tweet,
                                     distinct_tokens))
                                continue

                            index = calculation.index(tokens, distinct_tokens)
                            if index >= threshold:
                                is_distinct = False
                                break

                        if is_distinct:
                            distincts.append((time, tweet, tokens))

                            if category == 'new':
                                tp += 1
                            else:
                                fp += 1
                        else:
                            if category == 'new':
                                fn += 1
                            else:
                                tn += 1

                time_elapsed = tm.time() - start_time
                accuracy = (tp + tn) / (tp + tn + fp + fn)
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                fscore = 2 * (precision * recall) / (precision + recall)

                print()
                print('Limit hours: {}'.format(hours))
                print('Calculation: {}'.format(name))
                print('Ngrams: {}'.format(ngrams))
                print('Threshold: {}'.format(threshold))
                print('True positive: {}'.format(tp))
                print('True negative: {}'.format(tn))
                print('False positive: {}'.format(fp))
                print('False negative: {}'.format(fn))
                print('Accuracy: {}'.format(accuracy))
                print('Precision: {}'.format(precision))
                print('Recall: {}'.format(recall))
                print('F-score: {}'.format(fscore))
                print('Time elapsed: {}'.format(time_elapsed))

                results.append([
                    name, hours, ngrams, threshold, tp, tn, fp, fn, accuracy,
                    precision, recall, fscore, time_elapsed
                ])
    return results
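
# A minimal sketch (an assumption; the similarity classes are not shown in
# this excerpt) of what one of the calculation objects could look like: each
# exposes index(tokens_a, tokens_b) returning a similarity in [0.0, 1.0].
class DiceSketch:
    def index(self, tokens_a, tokens_b):
        # Dice coefficient over the two token sets: 2*|A & B| / (|A| + |B|).
        set_a, set_b = set(tokens_a), set(tokens_b)
        if not set_a and not set_b:
            return 0.0
        return 2 * len(set_a & set_b) / (len(set_a) + len(set_b))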
Example #7
parser.add_argument('-t', '--threshold', type=float, default=0.6, help='Threshold index, default: 0.6')
parser.add_argument('-a', '--algo', type=str, default='jaccard', help='Algorithm: jaccard, cosine')
args = parser.parse_args()

if args.algo == 'jaccard':
    algo = Jaccard()
elif args.algo == 'cosine':
    algo = Cosine()
else:
    raise Exception('Algo not defined')

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity-dataset15075.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1]) for line in dataset]

cleaned = [(time, tweet, cleaner.clean(tweet)) for (time, tweet) in tweets]
tokenized = [(time, tweet, tokenizer.ngrams_tokenizer(cleaned_tweets, args.ngrams)) for (time, tweet, cleaned_tweets) in cleaned]

distincts = []
progress = 0
with open(os.path.join(os.path.dirname(__file__), args.output), 'w', newline='\n') as csv_output:
    csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for (time, tweet, tokens) in tokenized:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized)), end='')
        if len(distincts) == 0:
            distincts.append((time, tweet, tokens))
            csv_writer.writerow([time, tweet, '[{}]'.format(','.join(tokens))])
        else:
            is_distinct = True
            for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
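                # Hypothetical continuation (an assumption; the excerpt ends
                # here), following the pattern of the earlier examples: the
                # tweet stops being distinct once any retained tweet reaches
                # the similarity threshold.
                if algo.index(tokens, distinct_tokens) >= args.threshold:
                    is_distinct = False
                    break
            if is_distinct:
                distincts.append((time, tweet, tokens))
                csv_writer.writerow(
                    [time, tweet, '[{}]'.format(','.join(tokens))])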