示例#1
0
def createDocument(dicts):
    """Score the labelled tweets in ``Ground_Truth.txt`` and print a summary.

    Reads ``Ground_Truth.txt`` from the current working directory. Every
    non-blank line is lower-cased and split on tabs; a line is evaluated only
    when it has exactly eight fields and its first field contains
    "sun oct", "sun nov" or "sun dec". The eight fields are read as: time,
    user name, tweet, geo code, hashes, English flag (int), football flag
    (int) and association.

    Each evaluated tweet is passed to ``weighting.decide_weight``; the row is
    then written to ``test_result.txt`` with the returned weight and teams
    appended as two extra tab-separated columns. A weight greater than 1 is
    treated as a positive prediction and compared against the football flag
    to build the confusion counts that are printed at the end.

    Args:
        dicts: Passed through unchanged to ``weighting.decide_weight``.

    Returns:
        None. Results are written to ``test_result.txt`` and printed.

    Raises:
        ValueError: If the English or football flag of an evaluated row is
            not an integer literal.
    """
    sunday_tags = ("sun oct", "sun nov", "sun dec")
    counter = 0
    english_counter = 0
    true_pos = 0
    false_pos = 0
    false_neg = 0
    true_neg = 0
    fb_num = 0

    # Context managers guarantee that both handles are closed (and the
    # results flushed to disk) even when a malformed row raises mid-loop.
    with open('Ground_Truth.txt', 'r') as truth_file, \
            open("test_result.txt", 'w') as dFile:
        for line in truth_file:
            line = line.strip('\n')
            if line == "":
                continue
            line2 = line.lower().split('\t')
            if len(line2) != 8:
                continue
            if not any(tag in line2[0] for tag in sunday_tags):
                continue

            timestamp, user_name, tweet, geo_code, hashes = line2[:5]
            english = int(line2[5])
            fb_true = int(line2[6])
            association = line2[7]

            metrics = weighting.decide_weight(user_name, tweet, hashes, dicts)
            weight = metrics[0]
            teams = metrics[1]

            counter += 1
            if english == 1:
                english_counter += 1

            predicted_football = weight > 1
            if fb_true == 1:
                fb_num += 1
                if predicted_football:
                    true_pos += 1
                else:
                    # Surface every missed football tweet for manual review.
                    print("X " + str(weight) + " " + tweet)
                    false_neg += 1
            elif predicted_football:
                false_pos += 1
            else:
                true_neg += 1

            new_line = "\t".join([
                timestamp, user_name, tweet, geo_code, hashes,
                str(english), str(fb_true), str(association),
                str(weight), str(teams),
            ]) + "\n"
            dFile.write(new_line)

    total_pos = true_pos + false_pos
    total_neg = true_neg + false_neg
    print("Total Tweets Checked: " + str(counter))
    print("Total English Tweets Checked: " + str(english_counter))
    print("Total Football Tweets in Corpus: " + str(fb_num))
    print("Num True Positives/Total Positives: " + str(true_pos) + "/" + str(total_pos))
    print("Num False Negatives/Total Negatives: " + str(false_neg) + "/" + str(total_neg))
示例#2
0
def createDocument(dicts):
    """Weight every Sunday tweet under the Jdata directory, one file per week.

    Every file in the hard-coded Jdata directory (except ``.DS_Store``) is
    read and all of its lines but the last are collected. Each non-blank
    line is lower-cased and split on tabs, then handled as follows:

    * first field contains "sun oct"/"sun nov"/"sun dec" and the line has
      exactly five fields (time, user name, tweet, geo code, hashes): it is
      a complete record and is emitted;
    * first field contains one of those tags but the field count differs:
      the line is kept as the start of a record that was broken across
      several lines;
    * otherwise it is a continuation: a single-field line is appended to the
      pending tweet text; a line with more than three fields appends its
      first field to the tweet text and its second and third fields as new
      columns, which completes the pending record and emits it.

    An emitted record is scored with ``weighting.decide_weight`` and written
    as ``counter, time, user, tweet, geo, hashes, weight, teams`` (tab
    separated) to ``indexed_tweets2_w<N>.txt`` in the hard-coded Wdata
    directory. ``N`` is incremented, and a new file started, each time the
    third whitespace-separated token of the time field changes (presumably
    the day of the month, i.e. one file per Sunday -- confirm against data).

    Args:
        dicts: Passed through unchanged to ``weighting.decide_weight``.

    Returns:
        None.

    Raises:
        IndexError: If the time field of an emitted record has fewer than
            three whitespace-separated tokens.
    """
    jdata_loc = "/Users/jacobportnoff/Dropbox/290-twitter/Jdata/"
    storage_loc = "/Users/jacobportnoff/Dropbox/290-twitter/Wdata/"
    storage_file = "indexed_tweets2_w"
    txt = ".txt"
    sunday_tags = ("sun oct", "sun nov", "sun dec")

    # NOTE: os.listdir returns entries in arbitrary order, so the week
    # numbering below follows whatever order the files happen to be listed.
    jdata_lines = []
    for filename2 in os.listdir(jdata_loc):
        if filename2 == ".DS_Store":
            continue
        print(filename2)
        with open(jdata_loc + filename2, 'r') as data:
            jdata = data.readlines()
        # Drop the final line by position. Comparing each line against
        # jdata[-1] by value also discarded any earlier line that happened
        # to be identical to the last one.
        jdata_lines.extend(jdata[:-1])
    print("game")

    week_num = 0
    week_holder = ""
    counter = 0
    longer_line = []  # fields of a record that is still being reassembled
    dFile = None      # currently open per-week output file, if any
    try:
        for line in jdata_lines:
            line = line.strip('\n')
            if line == "":
                continue
            line2 = line.lower().split('\t')
            is_sunday = any(tag in line2[0] for tag in sunday_tags)

            record = None
            if is_sunday and len(line2) == 5:
                record = line2
            elif is_sunday:
                longer_line = line2
            elif len(longer_line) < 3:
                # Continuation with no usable pending record (nothing
                # pending, or the pending header has no tweet column yet):
                # skip it instead of failing with an IndexError.
                continue
            elif len(line2) == 1:
                longer_line[2] += line2[0]
            elif len(line2) > 3:
                longer_line[2] += line2[0]
                longer_line.append(line2[1])
                longer_line.append(line2[2])
                record = longer_line
                # Reset the buffer so later stray continuations cannot be
                # glued onto a record that was already written.
                longer_line = []

            if record is None:
                continue

            time = record[0]
            holder = time.split()
            if week_holder != holder[2]:
                week_holder = holder[2]
                # Close the previous week's file whenever one is open; this
                # is safe even when the very first record arrives through
                # the continuation branch.
                if dFile is not None:
                    dFile.close()
                week_num += 1
                dFileName = storage_loc + storage_file + str(week_num) + txt
                dFile = open(dFileName, 'w')

            user_name, tweet, geo_code, hashes = record[1:5]
            metrics = weighting.decide_weight(user_name, tweet, hashes, dicts)
            weight = metrics[0]
            teams = metrics[1]
            if len(teams) != 0:
                # Each team is followed by a single space (trailing space kept).
                fb_teams = "".join(el + " " for el in teams)
            else:
                fb_teams = "null"

            new_line = "\t".join([
                str(counter), time, user_name, tweet, geo_code, hashes,
                str(weight), fb_teams,
            ]) + "\n"
            dFile.write(new_line)
            if counter % 10000 == 0:
                print("check counter " + str(counter))
            counter += 1
    finally:
        # The last week's file was previously left open (and possibly
        # unflushed); always close it, even if scoring raised.
        if dFile is not None:
            dFile.close()
示例#3
0
def _read_all_but_last_line(path):
    """Return the lines of the file at ``path`` without its final line."""
    with open(path, 'r') as handle:
        # Positional slice: only the physically last line is dropped, never
        # earlier lines that merely have the same content.
        return handle.readlines()[:-1]


def _teams_field(teams):
    """Render ``teams`` as space-terminated names, or "null" when empty."""
    if len(teams) == 0:
        return "null"
    return "".join(el + " " for el in teams)


def createDocument(dicts):
    """Score the Sunday tweets stored under Jdata and write them week by week.

    All files in the hard-coded Jdata directory other than ``.DS_Store`` are
    loaded (minus their last line). Non-blank lines are lower-cased and split
    on tabs. A line whose first field contains "sun oct", "sun nov" or
    "sun dec" starts a record: with exactly five fields (time, user name,
    tweet, geo code, hashes) it is complete; with any other field count it is
    buffered as a record broken across lines. Lines without such a tag are
    continuations of the buffered record: a one-field line extends the tweet
    text, and a line with more than three fields extends the tweet text with
    its first field, appends its second and third fields as columns and
    thereby completes the record.

    Each complete record is passed to ``weighting.decide_weight`` and written
    as a tab-separated row (running counter, time, user name, tweet, geo
    code, hashes, weight, teams) to ``indexed_tweets2_w<N>.txt`` in the
    hard-coded Wdata directory. A new file with the next ``N`` is opened
    whenever the third whitespace-separated token of the time field differs
    from the previous record's (presumably the day of the month -- TODO
    confirm against the data).

    Args:
        dicts: Passed through unchanged to ``weighting.decide_weight``.

    Returns:
        None.

    Raises:
        IndexError: If the time field of a complete record has fewer than
            three whitespace-separated tokens.
    """
    jdata_loc = "/Users/jacobportnoff/Dropbox/290-twitter/Jdata/"
    storage_loc = "/Users/jacobportnoff/Dropbox/290-twitter/Wdata/"
    storage_file = "indexed_tweets2_w"
    txt = ".txt"
    sunday_tags = ("sun oct", "sun nov", "sun dec")

    # NOTE: os.listdir yields names in arbitrary order; week numbers are
    # assigned in the order records are encountered.
    jdata_lines = []
    for filename2 in os.listdir(jdata_loc):
        if filename2 != ".DS_Store":
            print(filename2)
            jdata_lines.extend(_read_all_but_last_line(jdata_loc + filename2))
    print("game")

    week_num = 0
    week_holder = ""
    counter = 0
    pending = []    # fields of a multi-line record still being assembled
    dFile = None    # output file of the week currently being written
    try:
        for raw_line in jdata_lines:
            text = raw_line.strip('\n')
            if text == "":
                continue
            fields = text.lower().split('\t')
            starts_record = any(tag in fields[0] for tag in sunday_tags)

            if starts_record:
                if len(fields) != 5:
                    # Broken record: wait for its continuation lines.
                    pending = fields
                    continue
                record = fields
            else:
                if len(pending) < 3:
                    # No buffered record with a tweet column to extend;
                    # ignore the orphan line rather than raise IndexError.
                    continue
                if len(fields) == 1:
                    pending[2] += fields[0]
                    continue
                if len(fields) <= 3:
                    continue
                pending[2] += fields[0]
                pending.append(fields[1])
                pending.append(fields[2])
                record = pending
                # Clear the buffer so the finished record cannot be
                # extended and emitted a second time.
                pending = []

            time = record[0]
            day_token = time.split()[2]
            if week_holder != day_token:
                week_holder = day_token
                # Close whichever file is open; guarding on the handle
                # (not on a specific week number) avoids both touching an
                # unopened file and leaking an open one.
                if dFile is not None:
                    dFile.close()
                week_num += 1
                dFileName = storage_loc + storage_file + str(week_num) + txt
                dFile = open(dFileName, 'w')

            user_name = record[1]
            tweet = record[2]
            geo_code = record[3]
            hashes = record[4]
            metrics = weighting.decide_weight(user_name, tweet, hashes, dicts)
            weight = metrics[0]
            fb_teams = _teams_field(metrics[1])

            columns = [str(counter), time, user_name, tweet, geo_code,
                       hashes, str(weight), fb_teams]
            dFile.write("\t".join(columns) + "\n")
            if counter % 10000 == 0:
                print("check counter " + str(counter))
            counter += 1
    finally:
        # Make sure the final week's file is flushed and closed.
        if dFile is not None:
            dFile.close()
示例#4
0
def createDocument(dicts):
    """Evaluate the tweet weighting against ``Ground_Truth.txt``.

    ``Ground_Truth.txt`` is read from the current working directory. Blank
    lines are ignored; every other line is lower-cased and split on tabs.
    Only rows with exactly eight fields whose first field contains
    "sun oct", "sun nov" or "sun dec" are evaluated. Their fields are read
    as time, user name, tweet, geo code, hashes, English flag (int),
    football flag (int) and association.

    For each evaluated row ``weighting.decide_weight`` is called and the row
    is written to ``test_result.txt`` followed by the returned weight and
    teams. A weight greater than 1 counts as a positive prediction; together
    with the football flag it updates the true/false positive/negative
    counts, which are printed once the whole file has been processed.

    Args:
        dicts: Passed through unchanged to ``weighting.decide_weight``.

    Returns:
        None. Output goes to ``test_result.txt`` and standard output.

    Raises:
        ValueError: If the English or football flag of an evaluated row
            cannot be parsed as an integer.
    """
    sunday_tags = ("sun oct", "sun nov", "sun dec")
    counter = 0
    english_counter = 0
    true_pos = 0
    false_pos = 0
    false_neg = 0
    true_neg = 0
    fb_num = 0

    # Open both files in one ``with`` so they are always closed; the result
    # file used to be left open and could be lost if a row raised.
    with open('Ground_Truth.txt', 'r') as source, \
            open("test_result.txt", 'w') as dFile:
        for raw_line in source:
            text = raw_line.strip('\n')
            if text == "":
                continue
            fields = text.lower().split('\t')
            has_sunday_tag = any(tag in fields[0] for tag in sunday_tags)
            if not (has_sunday_tag and len(fields) == 8):
                continue

            when = fields[0]
            user_name = fields[1]
            tweet = fields[2]
            geo_code = fields[3]
            hashes = fields[4]
            english = int(fields[5])
            fb_true = int(fields[6])
            association = fields[7]

            metrics = weighting.decide_weight(user_name, tweet, hashes,
                                              dicts)
            weight = metrics[0]
            teams = metrics[1]

            counter += 1
            if english == 1:
                english_counter += 1

            is_football = fb_true == 1
            if is_football:
                fb_num += 1
            if weight > 1:
                if is_football:
                    true_pos += 1
                else:
                    false_pos += 1
            elif is_football:
                # Print each football tweet the weighting failed to flag.
                print("X " + str(weight) + " " + tweet)
                false_neg += 1
            else:
                true_neg += 1

            columns = [when, user_name, tweet, geo_code, hashes,
                       str(english), str(fb_true), str(association),
                       str(weight), str(teams)]
            dFile.write("\t".join(columns) + "\n")

    total_pos = true_pos + false_pos
    total_neg = true_neg + false_neg
    print("Total Tweets Checked: " + str(counter))
    print("Total English Tweets Checked: " + str(english_counter))
    print("Total Football Tweets in Corpus: " + str(fb_num))
    print("Num True Positives/Total Positives: " + str(true_pos) + "/" +
          str(total_pos))
    print("Num False Negatives/Total Negatives: " + str(false_neg) + "/" +
          str(total_neg))